From 40e64432ce98f4ee5671d39fbf909ef3f3592366 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 25 Apr 2019 19:03:14 -0400 Subject: dt-bindings: iio: tsl2583: convert bindings to YAML format Convert the tsl2583 device tree bindings to the new YAML format. Signed-off-by: Brian Masney Reviewed-by: Rob Herring Signed-off-by: Jonathan Cameron --- .../devicetree/bindings/iio/light/tsl2583.txt | 25 ------------ .../devicetree/bindings/iio/light/tsl2583.yaml | 46 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 25 deletions(-) delete mode 100644 Documentation/devicetree/bindings/iio/light/tsl2583.txt create mode 100644 Documentation/devicetree/bindings/iio/light/tsl2583.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.txt b/Documentation/devicetree/bindings/iio/light/tsl2583.txt deleted file mode 100644 index 059dffa1829a..000000000000 --- a/Documentation/devicetree/bindings/iio/light/tsl2583.txt +++ /dev/null @@ -1,25 +0,0 @@ -* TAOS TSL 2580/2581/2583 ALS sensor - -Required properties: - - - compatible: Should be one of - "amstaos,tsl2580" - "amstaos,tsl2581" - "amstaos,tsl2583" - - reg: the I2C address of the device - -Optional properties: - - - interrupts: the sole interrupt generated by the device - - Refer to interrupt-controller/interrupts.txt for generic interrupt client - node bindings. - - - vcc-supply: phandle to the regulator that provides power to the sensor. - -Example: - -tsl2581@29 { - compatible = "amstaos,tsl2581"; - reg = <0x29>; -}; diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.yaml b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml new file mode 100644 index 000000000000..e86ef64ecf03 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/light/tsl2583.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: AMS/TAOS Ambient Light Sensor (ALS) + +maintainers: + - Brian Masney + +description: | + Ambient light sensing with an i2c interface. + +properties: + compatible: + enum: + - amstaos,tsl2580 + - amstaos,tsl2581 + - amstaos,tsl2583 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + vcc-supply: + description: Regulator that provides power to the sensor + +required: + - compatible + - reg + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + light-sensor@29 { + compatible = "amstaos,tsl2581"; + reg = <0x29>; + }; + }; +... -- cgit From 17b62779cbe40773e10a83af000e51c29b764575 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 25 Apr 2019 19:03:13 -0400 Subject: dt-bindings: iio: tsl2772: convert bindings to YAML format Convert the tsl2772 device tree bindings to the new YAML format. Signed-off-by: Brian Masney Reviewed-by: Rob Herring Signed-off-by: Jonathan Cameron --- .../devicetree/bindings/iio/light/tsl2772.txt | 42 ----------- .../devicetree/bindings/iio/light/tsl2772.yaml | 83 ++++++++++++++++++++++ 2 files changed, 83 insertions(+), 42 deletions(-) delete mode 100644 Documentation/devicetree/bindings/iio/light/tsl2772.txt create mode 100644 Documentation/devicetree/bindings/iio/light/tsl2772.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.txt b/Documentation/devicetree/bindings/iio/light/tsl2772.txt deleted file mode 100644 index 1c5e6f17a1df..000000000000 --- a/Documentation/devicetree/bindings/iio/light/tsl2772.txt +++ /dev/null @@ -1,42 +0,0 @@ -* AMS/TAOS ALS and proximity sensor - -Required properties: - - - compatible: Should be one of - "amstaos,tsl2571" - "amstaos,tsl2671" - "amstaos,tmd2671" - "amstaos,tsl2771" - "amstaos,tmd2771" - "amstaos,tsl2572" - "amstaos,tsl2672" - "amstaos,tmd2672" - "amstaos,tsl2772" - "amstaos,tmd2772" - "avago,apds9930" - - reg: the I2C address of the device - -Optional properties: - - - amstaos,proximity-diodes - proximity diodes to enable. <0>, <1>, or <0 1> - are the only valid values. - - led-max-microamp - current for the proximity LED. Must be 100000, 50000, - 25000, or 13000. - - vdd-supply: phandle to the regulator that provides power to the sensor. - - vddio-supply: phandle to the regulator that provides power to the bus. - - interrupts: the sole interrupt generated by the device - - Refer to interrupt-controller/interrupts.txt for generic interrupt client - node bindings. - -Example: - -tsl2772@39 { - compatible = "amstaos,tsl2772"; - reg = <0x39>; - interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>; - vdd-supply = <&pm8941_l17>; - vddio-supply = <&pm8941_lvs1>; - amstaos,proximity-diodes = <0>; - led-max-microamp = <100000>; -}; diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.yaml b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml new file mode 100644 index 000000000000..ed2c3d5eadf5 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/light/tsl2772.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: AMS/TAOS Ambient Light Sensor (ALS) and Proximity Detector + +maintainers: + - Brian Masney + +description: | + Ambient light sensing and proximity detection with an i2c interface. + https://ams.com/documents/20143/36005/TSL2772_DS000181_2-00.pdf + +properties: + compatible: + enum: + - amstaos,tsl2571 + - amstaos,tsl2671 + - amstaos,tmd2671 + - amstaos,tsl2771 + - amstaos,tmd2771 + - amstaos,tsl2572 + - amstaos,tsl2672 + - amstaos,tmd2672 + - amstaos,tsl2772 + - amstaos,tmd2772 + - avago,apds9930 + + reg: + maxItems: 1 + + amstaos,proximity-diodes: + description: Proximity diodes to enable + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32-array + - minItems: 1 + maxItems: 2 + items: + minimum: 0 + maximum: 1 + + interrupts: + maxItems: 1 + + led-max-microamp: + description: Current for the proximity LED + enum: + - 13000 + - 25000 + - 50000 + - 100000 + + vdd-supply: + description: Regulator that provides power to the sensor + + vddio-supply: + description: Regulator that provides power to the bus + +required: + - compatible + - reg + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + sensor@39 { + compatible = "amstaos,tsl2772"; + reg = <0x39>; + interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>; + vdd-supply = <&pm8941_l17>; + vddio-supply = <&pm8941_lvs1>; + amstaos,proximity-diodes = <0>; + led-max-microamp = <100000>; + }; + }; +... -- cgit From 8b7a6a35746292e140304107e60f9a980416abf7 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 26 Apr 2019 13:39:16 +0200 Subject: iio: cros_ec: add 'id' sysfs entry This new sysfs entry is used to interpret ring buffer information, mainly by Android sensor HAL. It expand to all sensors, the documentation about 'id' we can found in Documentation/ABI/testing/sysfs-bus-iio-cros-ec. Also fix typo in docs, I replace 'Septembre' by 'September'. Signed-off-by: Gwendal Grignou Signed-off-by: Fabien Lahoudere Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio-cros-ec | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec index 0e95c2ca105c..6158f831c761 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec +++ b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec @@ -18,11 +18,11 @@ Description: values are 'base' and 'lid'. What: /sys/bus/iio/devices/iio:deviceX/id -Date: Septembre 2017 +Date: September 2017 KernelVersion: 4.14 Contact: linux-iio@vger.kernel.org Description: - This attribute is exposed by the CrOS EC legacy accelerometer - driver and represents the sensor ID as exposed by the EC. This - ID is used by the Android sensor service hardware abstraction - layer (sensor HAL) through the Android container on ChromeOS. + This attribute is exposed by the CrOS EC sensors driver and + represents the sensor ID as exposed by the EC. This ID is used + by the Android sensor service hardware abstraction layer (sensor + HAL) through the Android container on ChromeOS. -- cgit From 7e527e11d672e90f1a3dc8de84e0bfaccda15bba Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:00 +0300 Subject: mei: docs: move documentation under driver-api Move mei driver documentation under Documentation/driver-api/ Perform some minimal formating changes to produce correct sphinx rendering and add index.rst Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/mei/index.rst | 22 ++ Documentation/driver-api/mei/mei-client-bus.rst | 152 +++++++++++++ Documentation/driver-api/mei/mei.rst | 250 ++++++++++++++++++++ Documentation/misc-devices/mei/mei-client-bus.txt | 141 ------------ Documentation/misc-devices/mei/mei.txt | 266 ---------------------- 6 files changed, 425 insertions(+), 407 deletions(-) create mode 100644 Documentation/driver-api/mei/index.rst create mode 100644 Documentation/driver-api/mei/mei-client-bus.rst create mode 100644 Documentation/driver-api/mei/mei.rst delete mode 100644 Documentation/misc-devices/mei/mei-client-bus.txt delete mode 100644 Documentation/misc-devices/mei/mei.txt (limited to 'Documentation') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index d26308af6036..0dbaa987aa11 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -42,6 +42,7 @@ available subsections can be seen below. target mtdnand miscellaneous + mei/index w1 rapidio s390-drivers diff --git a/Documentation/driver-api/mei/index.rst b/Documentation/driver-api/mei/index.rst new file mode 100644 index 000000000000..35c1117d8366 --- /dev/null +++ b/Documentation/driver-api/mei/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: + +=================================================== +Intel(R) Management Engine Interface (Intel(R) MEI) +=================================================== + +**Copyright** |copy| 2019 Intel Corporation + + +.. only:: html + + .. class:: toc-title + + Table of Contents + +.. toctree:: + :maxdepth: 2 + + mei + mei-client-bus diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst new file mode 100644 index 000000000000..a26a85453bdf --- /dev/null +++ b/Documentation/driver-api/mei/mei-client-bus.rst @@ -0,0 +1,152 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================== +Intel(R) Management Engine (ME) Client bus API +============================================== + + +Rationale +========= + +MEI misc character device is useful for dedicated applications to send and receive +data to the many FW appliance found in Intel's ME from the user space. +However for some of the ME functionalities it make sense to leverage existing software +stack and expose them through existing kernel subsystems. + +In order to plug seamlessly into the kernel device driver model we add kernel virtual +bus abstraction on top of the MEI driver. This allows implementing linux kernel drivers +for the various MEI features as a stand alone entities found in their respective subsystem. +Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to +the existing code. + + +MEI CL bus API +============== + +A driver implementation for an MEI Client is very similar to existing bus +based device drivers. The driver registers itself as an MEI CL bus driver through +the ``struct mei_cl_driver`` structure: + +.. code-block:: C + + struct mei_cl_driver { + struct device_driver driver; + const char *name; + + const struct mei_cl_device_id *id_table; + + int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id); + int (*remove)(struct mei_cl_device *dev); + }; + + struct mei_cl_id { + char name[MEI_NAME_SIZE]; + kernel_ulong_t driver_info; + }; + +The mei_cl_id structure allows the driver to bind itself against a device name. + +To actually register a driver on the ME Client bus one must call the mei_cl_add_driver() +API. This is typically called at module init time. + +Once registered on the ME Client bus, a driver will typically try to do some I/O on +this bus and this should be done through the mei_cl_send() and mei_cl_recv() +routines. The latter is synchronous (blocks and sleeps until data shows up). +In order for drivers to be notified of pending events waiting for them (e.g. +an Rx event) they can register an event handler through the +mei_cl_register_event_cb() routine. Currently only the MEI_EVENT_RX event +will trigger an event handler call and the driver implementation is supposed +to call mei_recv() from the event handler in order to fetch the pending +received buffers. + + +Example +======= + +As a theoretical example let's pretend the ME comes with a "contact" NFC IP. +The driver init and exit routines for this device would look like: + +.. code-block:: C + + #define CONTACT_DRIVER_NAME "contact" + + static struct mei_cl_device_id contact_mei_cl_tbl[] = { + { CONTACT_DRIVER_NAME, }, + + /* required last entry */ + { } + }; + MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl); + + static struct mei_cl_driver contact_driver = { + .id_table = contact_mei_tbl, + .name = CONTACT_DRIVER_NAME, + + .probe = contact_probe, + .remove = contact_remove, + }; + + static int contact_init(void) + { + int r; + + r = mei_cl_driver_register(&contact_driver); + if (r) { + pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n"); + return r; + } + + return 0; + } + + static void __exit contact_exit(void) + { + mei_cl_driver_unregister(&contact_driver); + } + + module_init(contact_init); + module_exit(contact_exit); + +And the driver's simplified probe routine would look like that: + +.. code-block:: C + + int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id) + { + struct contact_driver *contact; + + [...] + mei_cl_enable_device(dev); + + mei_cl_register_event_cb(dev, contact_event_cb, contact); + + return 0; + } + +In the probe routine the driver first enable the MEI device and then registers +an ME bus event handler which is as close as it can get to registering a +threaded IRQ handler. +The handler implementation will typically call some I/O routine depending on +the pending events: + +#define MAX_NFC_PAYLOAD 128 + +.. code-block:: C + + static void contact_event_cb(struct mei_cl_device *dev, u32 events, + void *context) + { + struct contact_driver *contact = context; + + if (events & BIT(MEI_EVENT_RX)) { + u8 payload[MAX_NFC_PAYLOAD]; + int payload_size; + + payload_size = mei_recv(dev, payload, MAX_NFC_PAYLOAD); + if (payload_size <= 0) + return; + + /* Hook to the NFC subsystem */ + nfc_hci_recv_frame(contact->hdev, payload, payload_size); + } + } diff --git a/Documentation/driver-api/mei/mei.rst b/Documentation/driver-api/mei/mei.rst new file mode 100644 index 000000000000..5aa3a5e6496a --- /dev/null +++ b/Documentation/driver-api/mei/mei.rst @@ -0,0 +1,250 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Introduction +============ + +The Intel Management Engine (Intel ME) is an isolated and protected computing +resource (Co-processor) residing inside certain Intel chipsets. The Intel ME +provides support for computer/IT management features. The feature set +depends on the Intel chipset SKU. + +The Intel Management Engine Interface (Intel MEI, previously known as HECI) +is the interface between the Host and Intel ME. This interface is exposed +to the host as a PCI device. The Intel MEI Driver is in charge of the +communication channel between a host application and the Intel ME feature. + +Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and +each client has its own protocol. The protocol is message-based with a +header and payload up to 512 bytes. + +Prominent usage of the Intel ME Interface is to communicate with Intel(R) +Active Management Technology (Intel AMT) implemented in firmware running on +the Intel ME. + +Intel AMT provides the ability to manage a host remotely out-of-band (OOB) +even when the operating system running on the host processor has crashed or +is in a sleep state. + +Some examples of Intel AMT usage are: + - Monitoring hardware state and platform components + - Remote power off/on (useful for green computing or overnight IT + maintenance) + - OS updates + - Storage of useful platform information such as software assets + - Built-in hardware KVM + - Selective network isolation of Ethernet and IP protocol flows based + on policies set by a remote management console + - IDE device redirection from remote management console + +Intel AMT (OOB) communication is based on SOAP (deprecated +starting with Release 6.0) over HTTP/S or WS-Management protocol over +HTTP/S that are received from a remote management console application. + +For more information about Intel AMT: +http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + + +Intel MEI Driver +================ + +The driver exposes a misc device called /dev/mei. + +An application maintains communication with an Intel ME feature while +/dev/mei is open. The binding to a specific feature is performed by calling +MEI_CONNECT_CLIENT_IOCTL, which passes the desired UUID. +The number of instances of an Intel ME feature that can be opened +at the same time depends on the Intel ME feature, but most of the +features allow only a single instance. + +The Intel AMT Host Interface (Intel AMTHI) feature supports multiple +simultaneous user connected applications. The Intel MEI driver +handles this internally by maintaining request queues for the applications. + +The driver is transparent to data that are passed between firmware feature +and host application. + +Because some of the Intel ME features can change the system +configuration, the driver by default allows only a privileged +user to access it. + +A code snippet for an application communicating with Intel AMTHI client: + +.. code-block:: C + + struct mei_connect_client_data data; + fd = open(MEI_DEVICE); + + data.d.in_client_uuid = AMTHI_UUID; + + ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data); + + printf("Ver=%d, MaxLen=%ld\n", + data.d.in_client_uuid.protocol_version, + data.d.in_client_uuid.max_msg_length); + + [...] + + write(fd, amthi_req_data, amthi_req_data_len); + + [...] + + read(fd, &amthi_res_data, amthi_res_data_len); + + [...] + close(fd); + + +IOCTLs +====== + +The Intel MEI Driver supports the following IOCTL commands: + IOCTL_MEI_CONNECT_CLIENT Connect to firmware Feature (client). + + usage: + struct mei_connect_client_data clientData; + ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &clientData); + + inputs: + mei_connect_client_data struct contain the following + input field: + + in_client_uuid - UUID of the FW Feature that needs + to connect to. + outputs: + out_client_properties - Client Properties: MTU and Protocol Version. + + error returns: + EINVAL Wrong IOCTL Number + ENODEV Device or Connection is not initialized or ready. (e.g. Wrong UUID) + ENOMEM Unable to allocate memory to client internal data. + EFAULT Fatal Error (e.g. Unable to access user input data) + EBUSY Connection Already Open + + Notes: + max_msg_length (MTU) in client properties describes the maximum + data that can be sent or received. (e.g. if MTU=2K, can send + requests up to bytes 2k and received responses up to 2k bytes). + + IOCTL_MEI_NOTIFY_SET: enable or disable event notifications + + Usage: + uint32_t enable; + ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable); + + Inputs: + uint32_t enable = 1; + or + uint32_t enable[disable] = 0; + + Error returns: + EINVAL Wrong IOCTL Number + ENODEV Device is not initialized or the client not connected + ENOMEM Unable to allocate memory to client internal data. + EFAULT Fatal Error (e.g. Unable to access user input data) + EOPNOTSUPP if the device doesn't support the feature + + Notes: + The client must be connected in order to enable notification events + + + IOCTL_MEI_NOTIFY_GET : retrieve event + + Usage: + uint32_t event; + ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event); + + Outputs: + 1 - if an event is pending + 0 - if there is no even pending + + Error returns: + EINVAL Wrong IOCTL Number + ENODEV Device is not initialized or the client not connected + ENOMEM Unable to allocate memory to client internal data. + EFAULT Fatal Error (e.g. Unable to access user input data) + EOPNOTSUPP if the device doesn't support the feature + + Notes: + The client must be connected and event notification has to be enabled + in order to receive an event + + +Intel ME Applications +===================== + + 1) Intel Local Management Service (Intel LMS) + + Applications running locally on the platform communicate with Intel AMT Release + 2.0 and later releases in the same way that network applications do via SOAP + over HTTP (deprecated starting with Release 6.0) or with WS-Management over + SOAP over HTTP. This means that some Intel AMT features can be accessed from a + local application using the same network interface as a remote application + communicating with Intel AMT over the network. + + When a local application sends a message addressed to the local Intel AMT host + name, the Intel LMS, which listens for traffic directed to the host name, + intercepts the message and routes it to the Intel MEI. + For more information: + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "About Intel AMT" => "Local Access" + + For downloading Intel LMS: + http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ + + The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS + firmware feature using a defined UUID and then communicates with the feature + using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). + The protocol is used to maintain multiple sessions with Intel AMT from a + single application. + + See the protocol specification in the Intel AMT Software Development Kit (SDK) + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" + => "Information for Intel(R) vPro(TM) Gateway Developers" + => "Description of the Intel AMT Port Forwarding (APF) Protocol" + + 2) Intel AMT Remote configuration using a Local Agent + + A Local Agent enables IT personnel to configure Intel AMT out-of-the-box + without requiring installing additional data to enable setup. The remote + configuration process may involve an ISV-developed remote configuration + agent that runs on the host. + For more information: + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "Setup and Configuration of Intel AMT" => + "SDK Tools Supporting Setup and Configuration" => + "Using the Local Agent Sample" + + An open source Intel AMT configuration utility, implementing a local agent + that accesses the Intel MEI driver, can be found here: + http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ + + +Intel AMT OS Health Watchdog +============================ + +The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog. +Whenever the OS hangs or crashes, Intel AMT will send an event +to any subscriber to this event. This mechanism means that +IT knows when a platform crashes even when there is a hard failure on the host. + +The Intel AMT Watchdog is composed of two parts: + 1) Firmware feature - receives the heartbeats + and sends an event when the heartbeats stop. + 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, + configures the watchdog and sends the heartbeats. + +The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure +the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the +watchdog is 120 seconds. + +If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate +on the me client bus and watchdog devices won't be exposed. + +Supported Chipsets +================== +82X38/X48 Express and newer + + +--- +linux-mei@linux.intel.com diff --git a/Documentation/misc-devices/mei/mei-client-bus.txt b/Documentation/misc-devices/mei/mei-client-bus.txt deleted file mode 100644 index 743be4ec8989..000000000000 --- a/Documentation/misc-devices/mei/mei-client-bus.txt +++ /dev/null @@ -1,141 +0,0 @@ -Intel(R) Management Engine (ME) Client bus API -============================================== - - -Rationale -========= - -MEI misc character device is useful for dedicated applications to send and receive -data to the many FW appliance found in Intel's ME from the user space. -However for some of the ME functionalities it make sense to leverage existing software -stack and expose them through existing kernel subsystems. - -In order to plug seamlessly into the kernel device driver model we add kernel virtual -bus abstraction on top of the MEI driver. This allows implementing linux kernel drivers -for the various MEI features as a stand alone entities found in their respective subsystem. -Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to -the existing code. - - -MEI CL bus API -============== - -A driver implementation for an MEI Client is very similar to existing bus -based device drivers. The driver registers itself as an MEI CL bus driver through -the mei_cl_driver structure: - -struct mei_cl_driver { - struct device_driver driver; - const char *name; - - const struct mei_cl_device_id *id_table; - - int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id); - int (*remove)(struct mei_cl_device *dev); -}; - -struct mei_cl_id { - char name[MEI_NAME_SIZE]; - kernel_ulong_t driver_info; -}; - -The mei_cl_id structure allows the driver to bind itself against a device name. - -To actually register a driver on the ME Client bus one must call the mei_cl_add_driver() -API. This is typically called at module init time. - -Once registered on the ME Client bus, a driver will typically try to do some I/O on -this bus and this should be done through the mei_cl_send() and mei_cl_recv() -routines. The latter is synchronous (blocks and sleeps until data shows up). -In order for drivers to be notified of pending events waiting for them (e.g. -an Rx event) they can register an event handler through the -mei_cl_register_event_cb() routine. Currently only the MEI_EVENT_RX event -will trigger an event handler call and the driver implementation is supposed -to call mei_recv() from the event handler in order to fetch the pending -received buffers. - - -Example -======= - -As a theoretical example let's pretend the ME comes with a "contact" NFC IP. -The driver init and exit routines for this device would look like: - -#define CONTACT_DRIVER_NAME "contact" - -static struct mei_cl_device_id contact_mei_cl_tbl[] = { - { CONTACT_DRIVER_NAME, }, - - /* required last entry */ - { } -}; -MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl); - -static struct mei_cl_driver contact_driver = { - .id_table = contact_mei_tbl, - .name = CONTACT_DRIVER_NAME, - - .probe = contact_probe, - .remove = contact_remove, -}; - -static int contact_init(void) -{ - int r; - - r = mei_cl_driver_register(&contact_driver); - if (r) { - pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n"); - return r; - } - - return 0; -} - -static void __exit contact_exit(void) -{ - mei_cl_driver_unregister(&contact_driver); -} - -module_init(contact_init); -module_exit(contact_exit); - -And the driver's simplified probe routine would look like that: - -int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id) -{ - struct contact_driver *contact; - - [...] - mei_cl_enable_device(dev); - - mei_cl_register_event_cb(dev, contact_event_cb, contact); - - return 0; -} - -In the probe routine the driver first enable the MEI device and then registers -an ME bus event handler which is as close as it can get to registering a -threaded IRQ handler. -The handler implementation will typically call some I/O routine depending on -the pending events: - -#define MAX_NFC_PAYLOAD 128 - -static void contact_event_cb(struct mei_cl_device *dev, u32 events, - void *context) -{ - struct contact_driver *contact = context; - - if (events & BIT(MEI_EVENT_RX)) { - u8 payload[MAX_NFC_PAYLOAD]; - int payload_size; - - payload_size = mei_recv(dev, payload, MAX_NFC_PAYLOAD); - if (payload_size <= 0) - return; - - /* Hook to the NFC subsystem */ - nfc_hci_recv_frame(contact->hdev, payload, payload_size); - } -} diff --git a/Documentation/misc-devices/mei/mei.txt b/Documentation/misc-devices/mei/mei.txt deleted file mode 100644 index 2b80a0cd621f..000000000000 --- a/Documentation/misc-devices/mei/mei.txt +++ /dev/null @@ -1,266 +0,0 @@ -Intel(R) Management Engine Interface (Intel(R) MEI) -=================================================== - -Introduction -============ - -The Intel Management Engine (Intel ME) is an isolated and protected computing -resource (Co-processor) residing inside certain Intel chipsets. The Intel ME -provides support for computer/IT management features. The feature set -depends on the Intel chipset SKU. - -The Intel Management Engine Interface (Intel MEI, previously known as HECI) -is the interface between the Host and Intel ME. This interface is exposed -to the host as a PCI device. The Intel MEI Driver is in charge of the -communication channel between a host application and the Intel ME feature. - -Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and -each client has its own protocol. The protocol is message-based with a -header and payload up to 512 bytes. - -Prominent usage of the Intel ME Interface is to communicate with Intel(R) -Active Management Technology (Intel AMT) implemented in firmware running on -the Intel ME. - -Intel AMT provides the ability to manage a host remotely out-of-band (OOB) -even when the operating system running on the host processor has crashed or -is in a sleep state. - -Some examples of Intel AMT usage are: - - Monitoring hardware state and platform components - - Remote power off/on (useful for green computing or overnight IT - maintenance) - - OS updates - - Storage of useful platform information such as software assets - - Built-in hardware KVM - - Selective network isolation of Ethernet and IP protocol flows based - on policies set by a remote management console - - IDE device redirection from remote management console - -Intel AMT (OOB) communication is based on SOAP (deprecated -starting with Release 6.0) over HTTP/S or WS-Management protocol over -HTTP/S that are received from a remote management console application. - -For more information about Intel AMT: -http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - - -Intel MEI Driver -================ - -The driver exposes a misc device called /dev/mei. - -An application maintains communication with an Intel ME feature while -/dev/mei is open. The binding to a specific feature is performed by calling -MEI_CONNECT_CLIENT_IOCTL, which passes the desired UUID. -The number of instances of an Intel ME feature that can be opened -at the same time depends on the Intel ME feature, but most of the -features allow only a single instance. - -The Intel AMT Host Interface (Intel AMTHI) feature supports multiple -simultaneous user connected applications. The Intel MEI driver -handles this internally by maintaining request queues for the applications. - -The driver is transparent to data that are passed between firmware feature -and host application. - -Because some of the Intel ME features can change the system -configuration, the driver by default allows only a privileged -user to access it. - -A code snippet for an application communicating with Intel AMTHI client: - - struct mei_connect_client_data data; - fd = open(MEI_DEVICE); - - data.d.in_client_uuid = AMTHI_UUID; - - ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data); - - printf("Ver=%d, MaxLen=%ld\n", - data.d.in_client_uuid.protocol_version, - data.d.in_client_uuid.max_msg_length); - - [...] - - write(fd, amthi_req_data, amthi_req_data_len); - - [...] - - read(fd, &amthi_res_data, amthi_res_data_len); - - [...] - close(fd); - - -IOCTL -===== - -The Intel MEI Driver supports the following IOCTL commands: - IOCTL_MEI_CONNECT_CLIENT Connect to firmware Feature (client). - - usage: - struct mei_connect_client_data clientData; - ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &clientData); - - inputs: - mei_connect_client_data struct contain the following - input field: - - in_client_uuid - UUID of the FW Feature that needs - to connect to. - outputs: - out_client_properties - Client Properties: MTU and Protocol Version. - - error returns: - EINVAL Wrong IOCTL Number - ENODEV Device or Connection is not initialized or ready. - (e.g. Wrong UUID) - ENOMEM Unable to allocate memory to client internal data. - EFAULT Fatal Error (e.g. Unable to access user input data) - EBUSY Connection Already Open - - Notes: - max_msg_length (MTU) in client properties describes the maximum - data that can be sent or received. (e.g. if MTU=2K, can send - requests up to bytes 2k and received responses up to 2k bytes). - - IOCTL_MEI_NOTIFY_SET: enable or disable event notifications - - Usage: - uint32_t enable; - ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable); - - Inputs: - uint32_t enable = 1; - or - uint32_t enable[disable] = 0; - - Error returns: - EINVAL Wrong IOCTL Number - ENODEV Device is not initialized or the client not connected - ENOMEM Unable to allocate memory to client internal data. - EFAULT Fatal Error (e.g. Unable to access user input data) - EOPNOTSUPP if the device doesn't support the feature - - Notes: - The client must be connected in order to enable notification events - - - IOCTL_MEI_NOTIFY_GET : retrieve event - - Usage: - uint32_t event; - ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event); - - Outputs: - 1 - if an event is pending - 0 - if there is no even pending - - Error returns: - EINVAL Wrong IOCTL Number - ENODEV Device is not initialized or the client not connected - ENOMEM Unable to allocate memory to client internal data. - EFAULT Fatal Error (e.g. Unable to access user input data) - EOPNOTSUPP if the device doesn't support the feature - - Notes: - The client must be connected and event notification has to be enabled - in order to receive an event - - -Intel ME Applications -===================== - - 1) Intel Local Management Service (Intel LMS) - - Applications running locally on the platform communicate with Intel AMT Release - 2.0 and later releases in the same way that network applications do via SOAP - over HTTP (deprecated starting with Release 6.0) or with WS-Management over - SOAP over HTTP. This means that some Intel AMT features can be accessed from a - local application using the same network interface as a remote application - communicating with Intel AMT over the network. - - When a local application sends a message addressed to the local Intel AMT host - name, the Intel LMS, which listens for traffic directed to the host name, - intercepts the message and routes it to the Intel MEI. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "About Intel AMT" => "Local Access" - - For downloading Intel LMS: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - - The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS - firmware feature using a defined UUID and then communicates with the feature - using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). - The protocol is used to maintain multiple sessions with Intel AMT from a - single application. - - See the protocol specification in the Intel AMT Software Development Kit (SDK) - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" - => "Information for Intel(R) vPro(TM) Gateway Developers" - => "Description of the Intel AMT Port Forwarding (APF) Protocol" - - 2) Intel AMT Remote configuration using a Local Agent - - A Local Agent enables IT personnel to configure Intel AMT out-of-the-box - without requiring installing additional data to enable setup. The remote - configuration process may involve an ISV-developed remote configuration - agent that runs on the host. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "Setup and Configuration of Intel AMT" => - "SDK Tools Supporting Setup and Configuration" => - "Using the Local Agent Sample" - - An open source Intel AMT configuration utility, implementing a local agent - that accesses the Intel MEI driver, can be found here: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - - -Intel AMT OS Health Watchdog -============================ - -The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog. -Whenever the OS hangs or crashes, Intel AMT will send an event -to any subscriber to this event. This mechanism means that -IT knows when a platform crashes even when there is a hard failure on the host. - -The Intel AMT Watchdog is composed of two parts: - 1) Firmware feature - receives the heartbeats - and sends an event when the heartbeats stop. - 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, - configures the watchdog and sends the heartbeats. - -The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure -the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the -watchdog is 120 seconds. - -If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate -on the me client bus and watchdog devices won't be exposed. - - -Supported Chipsets -================== - -7 Series Chipset Family -6 Series Chipset Family -5 Series Chipset Family -4 Series Chipset Family -Mobile 4 Series Chipset Family -ICH9 -82946GZ/GL -82G35 Express -82Q963/Q965 -82P965/G965 -Mobile PM965/GM965 -Mobile GME965/GLE960 -82Q35 Express -82G33/G31/P35/P31 Express -82Q33 Express -82X38/X48 Express - ---- -linux-mei@linux.intel.com -- cgit From 815d0f26c104873eb829e24510383d4d098417dd Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:01 +0300 Subject: mei: docs: move iamt docs to a iamt.rst file Move intel amt documentation to a seprate file. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/iamt.rst | 106 +++++++++++++++++++++++++++++++++ Documentation/driver-api/mei/index.rst | 1 + Documentation/driver-api/mei/mei.rst | 100 ------------------------------- 3 files changed, 107 insertions(+), 100 deletions(-) create mode 100644 Documentation/driver-api/mei/iamt.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/iamt.rst b/Documentation/driver-api/mei/iamt.rst new file mode 100644 index 000000000000..6dcf5b16e958 --- /dev/null +++ b/Documentation/driver-api/mei/iamt.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Intel(R) Active Management Technology (Intel AMT) +================================================= + +Prominent usage of the Intel ME Interface is to communicate with Intel(R) +Active Management Technology (Intel AMT) implemented in firmware running on +the Intel ME. + +Intel AMT provides the ability to manage a host remotely out-of-band (OOB) +even when the operating system running on the host processor has crashed or +is in a sleep state. + +Some examples of Intel AMT usage are: + - Monitoring hardware state and platform components + - Remote power off/on (useful for green computing or overnight IT + maintenance) + - OS updates + - Storage of useful platform information such as software assets + - Built-in hardware KVM + - Selective network isolation of Ethernet and IP protocol flows based + on policies set by a remote management console + - IDE device redirection from remote management console + +Intel AMT (OOB) communication is based on SOAP (deprecated +starting with Release 6.0) over HTTP/S or WS-Management protocol over +HTTP/S that are received from a remote management console application. + +For more information about Intel AMT: +http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + + +Intel AMT Applications +====================== + + 1) Intel Local Management Service (Intel LMS) + + Applications running locally on the platform communicate with Intel AMT Release + 2.0 and later releases in the same way that network applications do via SOAP + over HTTP (deprecated starting with Release 6.0) or with WS-Management over + SOAP over HTTP. This means that some Intel AMT features can be accessed from a + local application using the same network interface as a remote application + communicating with Intel AMT over the network. + + When a local application sends a message addressed to the local Intel AMT host + name, the Intel LMS, which listens for traffic directed to the host name, + intercepts the message and routes it to the Intel MEI. + For more information: + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "About Intel AMT" => "Local Access" + + For downloading Intel LMS: + http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ + + The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS + firmware feature using a defined UUID and then communicates with the feature + using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). + The protocol is used to maintain multiple sessions with Intel AMT from a + single application. + + See the protocol specification in the Intel AMT Software Development Kit (SDK) + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" + => "Information for Intel(R) vPro(TM) Gateway Developers" + => "Description of the Intel AMT Port Forwarding (APF) Protocol" + + 2) Intel AMT Remote configuration using a Local Agent + + A Local Agent enables IT personnel to configure Intel AMT out-of-the-box + without requiring installing additional data to enable setup. The remote + configuration process may involve an ISV-developed remote configuration + agent that runs on the host. + For more information: + http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide + Under "Setup and Configuration of Intel AMT" => + "SDK Tools Supporting Setup and Configuration" => + "Using the Local Agent Sample" + + An open source Intel AMT configuration utility, implementing a local agent + that accesses the Intel MEI driver, can be found here: + http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ + + +Intel AMT OS Health Watchdog +============================ + +The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog. +Whenever the OS hangs or crashes, Intel AMT will send an event +to any subscriber to this event. This mechanism means that +IT knows when a platform crashes even when there is a hard failure on the host. + +The Intel AMT Watchdog is composed of two parts: + 1) Firmware feature - receives the heartbeats + and sends an event when the heartbeats stop. + 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, + configures the watchdog and sends the heartbeats. + +The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure +the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the +watchdog is 120 seconds. + +If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate +on the me client bus and watchdog devices won't be exposed. + +--- +linux-mei@linux.intel.com diff --git a/Documentation/driver-api/mei/index.rst b/Documentation/driver-api/mei/index.rst index 35c1117d8366..d261afac6852 100644 --- a/Documentation/driver-api/mei/index.rst +++ b/Documentation/driver-api/mei/index.rst @@ -20,3 +20,4 @@ Intel(R) Management Engine Interface (Intel(R) MEI) mei mei-client-bus + iamt diff --git a/Documentation/driver-api/mei/mei.rst b/Documentation/driver-api/mei/mei.rst index 5aa3a5e6496a..c7f10a4b46ff 100644 --- a/Documentation/driver-api/mei/mei.rst +++ b/Documentation/driver-api/mei/mei.rst @@ -17,33 +17,6 @@ Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and each client has its own protocol. The protocol is message-based with a header and payload up to 512 bytes. -Prominent usage of the Intel ME Interface is to communicate with Intel(R) -Active Management Technology (Intel AMT) implemented in firmware running on -the Intel ME. - -Intel AMT provides the ability to manage a host remotely out-of-band (OOB) -even when the operating system running on the host processor has crashed or -is in a sleep state. - -Some examples of Intel AMT usage are: - - Monitoring hardware state and platform components - - Remote power off/on (useful for green computing or overnight IT - maintenance) - - OS updates - - Storage of useful platform information such as software assets - - Built-in hardware KVM - - Selective network isolation of Ethernet and IP protocol flows based - on policies set by a remote management console - - IDE device redirection from remote management console - -Intel AMT (OOB) communication is based on SOAP (deprecated -starting with Release 6.0) over HTTP/S or WS-Management protocol over -HTTP/S that are received from a remote management console application. - -For more information about Intel AMT: -http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - - Intel MEI Driver ================ @@ -169,82 +142,9 @@ The Intel MEI Driver supports the following IOCTL commands: in order to receive an event -Intel ME Applications -===================== - - 1) Intel Local Management Service (Intel LMS) - - Applications running locally on the platform communicate with Intel AMT Release - 2.0 and later releases in the same way that network applications do via SOAP - over HTTP (deprecated starting with Release 6.0) or with WS-Management over - SOAP over HTTP. This means that some Intel AMT features can be accessed from a - local application using the same network interface as a remote application - communicating with Intel AMT over the network. - - When a local application sends a message addressed to the local Intel AMT host - name, the Intel LMS, which listens for traffic directed to the host name, - intercepts the message and routes it to the Intel MEI. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "About Intel AMT" => "Local Access" - - For downloading Intel LMS: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - - The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS - firmware feature using a defined UUID and then communicates with the feature - using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). - The protocol is used to maintain multiple sessions with Intel AMT from a - single application. - - See the protocol specification in the Intel AMT Software Development Kit (SDK) - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" - => "Information for Intel(R) vPro(TM) Gateway Developers" - => "Description of the Intel AMT Port Forwarding (APF) Protocol" - - 2) Intel AMT Remote configuration using a Local Agent - - A Local Agent enables IT personnel to configure Intel AMT out-of-the-box - without requiring installing additional data to enable setup. The remote - configuration process may involve an ISV-developed remote configuration - agent that runs on the host. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "Setup and Configuration of Intel AMT" => - "SDK Tools Supporting Setup and Configuration" => - "Using the Local Agent Sample" - - An open source Intel AMT configuration utility, implementing a local agent - that accesses the Intel MEI driver, can be found here: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - - -Intel AMT OS Health Watchdog -============================ - -The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog. -Whenever the OS hangs or crashes, Intel AMT will send an event -to any subscriber to this event. This mechanism means that -IT knows when a platform crashes even when there is a hard failure on the host. - -The Intel AMT Watchdog is composed of two parts: - 1) Firmware feature - receives the heartbeats - and sends an event when the heartbeats stop. - 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, - configures the watchdog and sends the heartbeats. - -The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure -the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the -watchdog is 120 seconds. - -If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate -on the me client bus and watchdog devices won't be exposed. Supported Chipsets ================== 82X38/X48 Express and newer - ---- linux-mei@linux.intel.com -- cgit From 6080e0cff2bf7108d3f2855a7177b1f7f1830035 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:03 +0300 Subject: mei: docs: update mei client bus documentation. The mei client bus API has changed significantly from time it was documented, and had required update. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/mei-client-bus.rst | 162 +++++++++++++----------- 1 file changed, 85 insertions(+), 77 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst index a26a85453bdf..7310dd45c484 100644 --- a/Documentation/driver-api/mei/mei-client-bus.rst +++ b/Documentation/driver-api/mei/mei-client-bus.rst @@ -8,13 +8,13 @@ Intel(R) Management Engine (ME) Client bus API Rationale ========= -MEI misc character device is useful for dedicated applications to send and receive +The MEI character device is useful for dedicated applications to send and receive data to the many FW appliance found in Intel's ME from the user space. -However for some of the ME functionalities it make sense to leverage existing software +However, for some of the ME functionalities it makes sense to leverage existing software stack and expose them through existing kernel subsystems. In order to plug seamlessly into the kernel device driver model we add kernel virtual -bus abstraction on top of the MEI driver. This allows implementing linux kernel drivers +bus abstraction on top of the MEI driver. This allows implementing Linux kernel drivers for the various MEI features as a stand alone entities found in their respective subsystem. Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to the existing code. @@ -23,9 +23,9 @@ the existing code. MEI CL bus API ============== -A driver implementation for an MEI Client is very similar to existing bus +A driver implementation for an MEI Client is very similar to any other existing bus based device drivers. The driver registers itself as an MEI CL bus driver through -the ``struct mei_cl_driver`` structure: +the ``struct mei_cl_driver`` structure defined in :file:`include/linux/mei_cl_bus.c` .. code-block:: C @@ -39,25 +39,38 @@ the ``struct mei_cl_driver`` structure: int (*remove)(struct mei_cl_device *dev); }; - struct mei_cl_id { - char name[MEI_NAME_SIZE]; + + +The mei_cl_device_id structure defined in :file:`include/linux/mod_devicetable.h` allows a +driver to bind itself against a device name. + +.. code-block:: C + + struct mei_cl_device_id { + char name[MEI_CL_NAME_SIZE]; + uuid_le uuid; + __u8 version; kernel_ulong_t driver_info; }; -The mei_cl_id structure allows the driver to bind itself against a device name. +To actually register a driver on the ME Client bus one must call the :c:func:`mei_cl_add_driver` +API. This is typically called at module initialization time. + +Once the driver is registered and bound to the device, a driver will typically +try to do some I/O on this bus and this should be done through the :c:func:`mei_cl_send` +and :c:func:`mei_cl_recv` functions. More detailed information is in :ref:`api` section. + +In order for a driver to be notified about pending traffic or event, the driver +should register a callback via :c:func:`mei_cl_devev_register_rx_cb` and +:c:func:`mei_cldev_register_notify_cb` function respectively. -To actually register a driver on the ME Client bus one must call the mei_cl_add_driver() -API. This is typically called at module init time. +.. _api: + +API: +---- +.. kernel-doc:: drivers/misc/mei/bus.c + :export: drivers/misc/mei/bus.c -Once registered on the ME Client bus, a driver will typically try to do some I/O on -this bus and this should be done through the mei_cl_send() and mei_cl_recv() -routines. The latter is synchronous (blocks and sleeps until data shows up). -In order for drivers to be notified of pending events waiting for them (e.g. -an Rx event) they can register an event handler through the -mei_cl_register_event_cb() routine. Currently only the MEI_EVENT_RX event -will trigger an event handler call and the driver implementation is supposed -to call mei_recv() from the event handler in order to fetch the pending -received buffers. Example @@ -68,85 +81,80 @@ The driver init and exit routines for this device would look like: .. code-block:: C - #define CONTACT_DRIVER_NAME "contact" + #define CONTACT_DRIVER_NAME "contact" - static struct mei_cl_device_id contact_mei_cl_tbl[] = { - { CONTACT_DRIVER_NAME, }, + static struct mei_cl_device_id contact_mei_cl_tbl[] = { + { CONTACT_DRIVER_NAME, }, - /* required last entry */ - { } - }; - MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl); + /* required last entry */ + { } + }; + MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl); - static struct mei_cl_driver contact_driver = { - .id_table = contact_mei_tbl, - .name = CONTACT_DRIVER_NAME, + static struct mei_cl_driver contact_driver = { + .id_table = contact_mei_tbl, + .name = CONTACT_DRIVER_NAME, - .probe = contact_probe, - .remove = contact_remove, - }; + .probe = contact_probe, + .remove = contact_remove, + }; - static int contact_init(void) - { - int r; + static int contact_init(void) + { + int r; - r = mei_cl_driver_register(&contact_driver); - if (r) { - pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n"); - return r; - } + r = mei_cl_driver_register(&contact_driver); + if (r) { + pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n"); + return r; + } - return 0; - } + return 0; + } - static void __exit contact_exit(void) - { - mei_cl_driver_unregister(&contact_driver); - } + static void __exit contact_exit(void) + { + mei_cl_driver_unregister(&contact_driver); + } - module_init(contact_init); - module_exit(contact_exit); + module_init(contact_init); + module_exit(contact_exit); And the driver's simplified probe routine would look like that: .. code-block:: C - int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id) - { - struct contact_driver *contact; + int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id) + { + [...] + mei_cldev_enable(dev); - [...] - mei_cl_enable_device(dev); + mei_cldev_register_rx_cb(dev, contact_rx_cb); - mei_cl_register_event_cb(dev, contact_event_cb, contact); - - return 0; - } + return 0; + } In the probe routine the driver first enable the MEI device and then registers -an ME bus event handler which is as close as it can get to registering a -threaded IRQ handler. -The handler implementation will typically call some I/O routine depending on -the pending events: - -#define MAX_NFC_PAYLOAD 128 +an rx handler which is as close as it can get to registering a threaded IRQ handler. +The handler implementation will typically call :c:func:`mei_cldev_recv` and then +process received data. .. code-block:: C - static void contact_event_cb(struct mei_cl_device *dev, u32 events, - void *context) - { - struct contact_driver *contact = context; + #define MAX_PAYLOAD 128 + #define HDR_SIZE 4 + static void conntact_rx_cb(struct mei_cl_device *cldev) + { + struct contact *c = mei_cldev_get_drvdata(cldev); + unsigned char payload[MAX_PAYLOAD]; + ssize_t payload_sz; + + payload_sz = mei_cldev_recv(cldev, payload, MAX_PAYLOAD) + if (reply_size < HDR_SIZE) { + return; + } - if (events & BIT(MEI_EVENT_RX)) { - u8 payload[MAX_NFC_PAYLOAD]; - int payload_size; + c->process_rx(payload); - payload_size = mei_recv(dev, payload, MAX_NFC_PAYLOAD); - if (payload_size <= 0) - return; + } - /* Hook to the NFC subsystem */ - nfc_hci_recv_frame(contact->hdev, payload, payload_size); - } - } -- cgit From 4e3d3b784ae7cd86ace2776c01be99ddfd378801 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:04 +0300 Subject: mei: docs: add a short description for nfc behind mei Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/index.rst | 2 +- Documentation/driver-api/mei/mei-client-bus.rst | 7 +++++++ Documentation/driver-api/mei/nfc.rst | 28 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 Documentation/driver-api/mei/nfc.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/index.rst b/Documentation/driver-api/mei/index.rst index d261afac6852..3a22b522ee78 100644 --- a/Documentation/driver-api/mei/index.rst +++ b/Documentation/driver-api/mei/index.rst @@ -16,7 +16,7 @@ Intel(R) Management Engine Interface (Intel(R) MEI) Table of Contents .. toctree:: - :maxdepth: 2 + :maxdepth: 3 mei mei-client-bus diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst index 7310dd45c484..bfe28ebc3ca8 100644 --- a/Documentation/driver-api/mei/mei-client-bus.rst +++ b/Documentation/driver-api/mei/mei-client-bus.rst @@ -158,3 +158,10 @@ process received data. } +MEI Client Bus Drivers +====================== + +.. toctree:: + :maxdepth: 2 + + nfc diff --git a/Documentation/driver-api/mei/nfc.rst b/Documentation/driver-api/mei/nfc.rst new file mode 100644 index 000000000000..b5b6fc96f85e --- /dev/null +++ b/Documentation/driver-api/mei/nfc.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +MEI NFC +------- + +Some Intel 8 and 9 Serieses chipsets supports NFC devices connected behind +the Intel Management Engine controller. +MEI client bus exposes the NFC chips as NFC phy devices and enables +binding with Microread and NXP PN544 NFC device driver from the Linux NFC +subsystem. + +.. kernel-render:: DOT + :alt: MEI NFC digraph + :caption: **MEI NFC** Stack + + digraph NFC { + cl_nfc -> me_cl_nfc; + "drivers/nfc/mei_phy" -> cl_nfc [lhead=bus]; + "drivers/nfc/microread/mei" -> cl_nfc; + "drivers/nfc/microread/mei" -> "drivers/nfc/mei_phy"; + "drivers/nfc/pn544/mei" -> cl_nfc; + "drivers/nfc/pn544/mei" -> "drivers/nfc/mei_phy"; + "net/nfc" -> "drivers/nfc/microread/mei"; + "net/nfc" -> "drivers/nfc/pn544/mei"; + "neard" -> "net/nfc"; + cl_nfc [label="mei/bus(nfc)"]; + me_cl_nfc [label="me fw (nfc)"]; + } -- cgit From 0475afd2a5dee99defdb7b030c09ba202ea3c64a Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:05 +0300 Subject: mei: docs: add hdcp documentation 1. Add a short ducumentation for MEI HDCP driver, and fix DOC comments in drivers/misc/mei/hdcp/mei_hdcp.c Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/hdcp.rst | 32 +++++++++++++++++++++++++ Documentation/driver-api/mei/mei-client-bus.rst | 1 + 2 files changed, 33 insertions(+) create mode 100644 Documentation/driver-api/mei/hdcp.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/hdcp.rst b/Documentation/driver-api/mei/hdcp.rst new file mode 100644 index 000000000000..e85a065b1cdc --- /dev/null +++ b/Documentation/driver-api/mei/hdcp.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +HDCP: +===== + +ME FW as a security engine provides the capability for setting up +HDCP2.2 protocol negotiation between the Intel graphics device and +an HDC2.2 sink. + +ME FW prepares HDCP2.2 negotiation parameters, signs and encrypts them +according the HDCP 2.2 spec. The Intel graphics sends the created blob +to the HDCP2.2 sink. + +Similarly, the HDCP2.2 sink's response is transferred to ME FW +for decryption and verification. + +Once all the steps of HDCP2.2 negotiation are completed, +upon request ME FW will configure the port as authenticated and supply +the HDCP encryption keys to Intel graphics hardware. + + +mei_hdcp driver +--------------- +.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c + :doc: MEI_HDCP Client Driver + +mei_hdcp api +------------ + +.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c + :functions: + diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst index bfe28ebc3ca8..f242b3f8d6aa 100644 --- a/Documentation/driver-api/mei/mei-client-bus.rst +++ b/Documentation/driver-api/mei/mei-client-bus.rst @@ -164,4 +164,5 @@ MEI Client Bus Drivers .. toctree:: :maxdepth: 2 + hdcp nfc -- cgit From 7e706da35a458f4b0d4c565c7b71023d8bfe279b Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 3 Jun 2019 12:14:06 +0300 Subject: mei: docs: fix broken links in iamt documentation. The iAMT documentation moved from http:// https://, and LMS is moved to github.com Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/iamt.rst | 105 ++++++++++++++++------------------ 1 file changed, 50 insertions(+), 55 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/iamt.rst b/Documentation/driver-api/mei/iamt.rst index 6dcf5b16e958..6ef3e613684b 100644 --- a/Documentation/driver-api/mei/iamt.rst +++ b/Documentation/driver-api/mei/iamt.rst @@ -27,62 +27,57 @@ starting with Release 6.0) over HTTP/S or WS-Management protocol over HTTP/S that are received from a remote management console application. For more information about Intel AMT: -http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide +https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm Intel AMT Applications -====================== - - 1) Intel Local Management Service (Intel LMS) - - Applications running locally on the platform communicate with Intel AMT Release - 2.0 and later releases in the same way that network applications do via SOAP - over HTTP (deprecated starting with Release 6.0) or with WS-Management over - SOAP over HTTP. This means that some Intel AMT features can be accessed from a - local application using the same network interface as a remote application - communicating with Intel AMT over the network. - - When a local application sends a message addressed to the local Intel AMT host - name, the Intel LMS, which listens for traffic directed to the host name, - intercepts the message and routes it to the Intel MEI. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "About Intel AMT" => "Local Access" - - For downloading Intel LMS: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - - The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS - firmware feature using a defined UUID and then communicates with the feature - using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). - The protocol is used to maintain multiple sessions with Intel AMT from a - single application. - - See the protocol specification in the Intel AMT Software Development Kit (SDK) - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" - => "Information for Intel(R) vPro(TM) Gateway Developers" - => "Description of the Intel AMT Port Forwarding (APF) Protocol" - - 2) Intel AMT Remote configuration using a Local Agent - - A Local Agent enables IT personnel to configure Intel AMT out-of-the-box - without requiring installing additional data to enable setup. The remote - configuration process may involve an ISV-developed remote configuration - agent that runs on the host. - For more information: - http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide - Under "Setup and Configuration of Intel AMT" => - "SDK Tools Supporting Setup and Configuration" => - "Using the Local Agent Sample" - - An open source Intel AMT configuration utility, implementing a local agent - that accesses the Intel MEI driver, can be found here: - http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/ - +---------------------- + + 1) Intel Local Management Service (Intel LMS) + + Applications running locally on the platform communicate with Intel AMT Release + 2.0 and later releases in the same way that network applications do via SOAP + over HTTP (deprecated starting with Release 6.0) or with WS-Management over + SOAP over HTTP. This means that some Intel AMT features can be accessed from a + local application using the same network interface as a remote application + communicating with Intel AMT over the network. + + When a local application sends a message addressed to the local Intel AMT host + name, the Intel LMS, which listens for traffic directed to the host name, + intercepts the message and routes it to the Intel MEI. + For more information: + https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm + Under "About Intel AMT" => "Local Access" + + For downloading Intel LMS: + https://github.com/intel/lms + + The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS + firmware feature using a defined GUID and then communicates with the feature + using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol). + The protocol is used to maintain multiple sessions with Intel AMT from a + single application. + + See the protocol specification in the Intel AMT Software Development Kit (SDK) + https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm + Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)" + => "Information for Intel(R) vPro(TM) Gateway Developers" + => "Description of the Intel AMT Port Forwarding (APF) Protocol" + + 2) Intel AMT Remote configuration using a Local Agent + + A Local Agent enables IT personnel to configure Intel AMT out-of-the-box + without requiring installing additional data to enable setup. The remote + configuration process may involve an ISV-developed remote configuration + agent that runs on the host. + For more information: + https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm + Under "Setup and Configuration of Intel AMT" => + "SDK Tools Supporting Setup and Configuration" => + "Using the Local Agent Sample" Intel AMT OS Health Watchdog -============================ +---------------------------- The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog. Whenever the OS hangs or crashes, Intel AMT will send an event @@ -90,10 +85,10 @@ to any subscriber to this event. This mechanism means that IT knows when a platform crashes even when there is a hard failure on the host. The Intel AMT Watchdog is composed of two parts: - 1) Firmware feature - receives the heartbeats - and sends an event when the heartbeats stop. - 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, - configures the watchdog and sends the heartbeats. + 1) Firmware feature - receives the heartbeats + and sends an event when the heartbeats stop. + 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature, + configures the watchdog and sends the heartbeats. The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the -- cgit From d0a178095c5fbbd25454c20e49bc3a7d70ecb769 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Thu, 6 Jun 2019 16:31:08 +0300 Subject: mei: docs: update mei documentation The mei driver went via multiple changes, update the documentation and fix formatting. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/mei/mei.rst | 96 +++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 35 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/mei/mei.rst b/Documentation/driver-api/mei/mei.rst index c7f10a4b46ff..c800d8e5f422 100644 --- a/Documentation/driver-api/mei/mei.rst +++ b/Documentation/driver-api/mei/mei.rst @@ -5,34 +5,32 @@ Introduction The Intel Management Engine (Intel ME) is an isolated and protected computing resource (Co-processor) residing inside certain Intel chipsets. The Intel ME -provides support for computer/IT management features. The feature set -depends on the Intel chipset SKU. +provides support for computer/IT management and security features. +The actual feature set depends on the Intel chipset SKU. The Intel Management Engine Interface (Intel MEI, previously known as HECI) is the interface between the Host and Intel ME. This interface is exposed -to the host as a PCI device. The Intel MEI Driver is in charge of the -communication channel between a host application and the Intel ME feature. +to the host as a PCI device, actually multiple PCI devices might be exposed. +The Intel MEI Driver is in charge of the communication channel between +a host application and the Intel ME features. -Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and +Each Intel ME feature, or Intel ME Client is addressed by a unique GUID and each client has its own protocol. The protocol is message-based with a -header and payload up to 512 bytes. +header and payload up to maximal number of bytes advertised by the client, +upon connection. Intel MEI Driver ================ -The driver exposes a misc device called /dev/mei. +The driver exposes a character device with device nodes /dev/meiX. An application maintains communication with an Intel ME feature while -/dev/mei is open. The binding to a specific feature is performed by calling -MEI_CONNECT_CLIENT_IOCTL, which passes the desired UUID. +/dev/meiX is open. The binding to a specific feature is performed by calling +:c:macro:`MEI_CONNECT_CLIENT_IOCTL`, which passes the desired GUID. The number of instances of an Intel ME feature that can be opened at the same time depends on the Intel ME feature, but most of the features allow only a single instance. -The Intel AMT Host Interface (Intel AMTHI) feature supports multiple -simultaneous user connected applications. The Intel MEI driver -handles this internally by maintaining request queues for the applications. - The driver is transparent to data that are passed between firmware feature and host application. @@ -40,6 +38,8 @@ Because some of the Intel ME features can change the system configuration, the driver by default allows only a privileged user to access it. +The session is terminated calling :c:func:`close(int fd)`. + A code snippet for an application communicating with Intel AMTHI client: .. code-block:: C @@ -47,13 +47,13 @@ A code snippet for an application communicating with Intel AMTHI client: struct mei_connect_client_data data; fd = open(MEI_DEVICE); - data.d.in_client_uuid = AMTHI_UUID; + data.d.in_client_uuid = AMTHI_GUID; ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data); printf("Ver=%d, MaxLen=%ld\n", - data.d.in_client_uuid.protocol_version, - data.d.in_client_uuid.max_msg_length); + data.d.in_client_uuid.protocol_version, + data.d.in_client_uuid.max_msg_length); [...] @@ -67,60 +67,86 @@ A code snippet for an application communicating with Intel AMTHI client: close(fd); -IOCTLs -====== +User space API + +IOCTLs: +======= The Intel MEI Driver supports the following IOCTL commands: - IOCTL_MEI_CONNECT_CLIENT Connect to firmware Feature (client). - usage: - struct mei_connect_client_data clientData; - ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &clientData); +IOCTL_MEI_CONNECT_CLIENT +------------------------- +Connect to firmware Feature/Client. + +.. code-block:: none + + Usage: - inputs: - mei_connect_client_data struct contain the following - input field: + struct mei_connect_client_data client_data; - in_client_uuid - UUID of the FW Feature that needs + ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &client_data); + + Inputs: + + struct mei_connect_client_data - contain the following + Input field: + + in_client_uuid - GUID of the FW Feature that needs to connect to. - outputs: + Outputs: out_client_properties - Client Properties: MTU and Protocol Version. - error returns: + Error returns: + + ENOTTY No such client (i.e. wrong GUID) or connection is not allowed. EINVAL Wrong IOCTL Number - ENODEV Device or Connection is not initialized or ready. (e.g. Wrong UUID) + ENODEV Device or Connection is not initialized or ready. ENOMEM Unable to allocate memory to client internal data. EFAULT Fatal Error (e.g. Unable to access user input data) EBUSY Connection Already Open - Notes: +:Note: max_msg_length (MTU) in client properties describes the maximum data that can be sent or received. (e.g. if MTU=2K, can send requests up to bytes 2k and received responses up to 2k bytes). - IOCTL_MEI_NOTIFY_SET: enable or disable event notifications + +IOCTL_MEI_NOTIFY_SET +--------------------- +Enable or disable event notifications. + + +.. code-block:: none Usage: + uint32_t enable; + ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable); - Inputs: + uint32_t enable = 1; or uint32_t enable[disable] = 0; Error returns: + + EINVAL Wrong IOCTL Number ENODEV Device is not initialized or the client not connected ENOMEM Unable to allocate memory to client internal data. EFAULT Fatal Error (e.g. Unable to access user input data) EOPNOTSUPP if the device doesn't support the feature - Notes: +:Note: The client must be connected in order to enable notification events - IOCTL_MEI_NOTIFY_GET : retrieve event +IOCTL_MEI_NOTIFY_GET +-------------------- +Retrieve event + +.. code-block:: none Usage: uint32_t event; @@ -137,7 +163,7 @@ The Intel MEI Driver supports the following IOCTL commands: EFAULT Fatal Error (e.g. Unable to access user input data) EOPNOTSUPP if the device doesn't support the feature - Notes: +:Note: The client must be connected and event notification has to be enabled in order to receive an event -- cgit From c2a6ea23a40152a03d6c605d6f0ea239b11605b8 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 14 Jun 2019 15:32:18 +0100 Subject: dt-bindings: fsl: scu: add ocotp binding NXP i.MX8QXP is an ARMv8 SoC with a Cortex-M4 core inside as system controller(SCU), the ocotp controller is being controlled by the SCU, so Linux need use RPC to SCU for ocotp handling. This patch adds binding doc for i.MX8 SCU OCOTP driver. Cc: Mark Rutland Cc: Shawn Guo Cc: Ulf Hansson Cc: Stephen Boyd Cc: Anson Huang Cc: devicetree@vger.kernel.org Reviewed-by: Rob Herring Reviewed-by: Dong Aisheng Signed-off-by: Peng Fan Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/arm/freescale/fsl,scu.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt index 5d7dbabbb784..f378922906f6 100644 --- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt @@ -133,6 +133,18 @@ RTC bindings based on SCU Message Protocol Required properties: - compatible: should be "fsl,imx8qxp-sc-rtc"; +OCOTP bindings based on SCU Message Protocol +------------------------------------------------------------ +Required properties: +- compatible: Should be "fsl,imx8qxp-scu-ocotp" +- #address-cells: Must be 1. Contains byte index +- #size-cells: Must be 1. Contains byte length + +Optional Child nodes: + +- Data cells of ocotp: + Detailed bindings are described in bindings/nvmem/nvmem.txt + Example (imx8qxp): ------------- aliases { @@ -177,6 +189,16 @@ firmware { ... }; + ocotp: imx8qx-ocotp { + compatible = "fsl,imx8qxp-scu-ocotp"; + #address-cells = <1>; + #size-cells = <1>; + + fec_mac0: mac@2c4 { + reg = <0x2c4 8>; + }; + }; + pd: imx8qx-pd { compatible = "fsl,imx8qxp-scu-pd", "fsl,scu-pd"; #power-domain-cells = <1>; -- cgit From 42c3dcedc6b76b1383e1b63be82b84c058b5a8ad Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 14 Jun 2019 15:32:21 +0100 Subject: dt-bindings: nvmem: Convert Allwinner SID to a schema The Allwinner SoCs have an efuse supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schemas. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- .../bindings/nvmem/allwinner,sun4i-a10-sid.yaml | 51 ++++++++++++++++++++++ .../bindings/nvmem/allwinner,sunxi-sid.txt | 29 ------------ 2 files changed, 51 insertions(+), 29 deletions(-) create mode 100644 Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml delete mode 100644 Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml new file mode 100644 index 000000000000..c9efd6e2c134 --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/allwinner,sun4i-a10-sid.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 Security ID Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +allOf: + - $ref: "nvmem.yaml#" + +properties: + compatible: + enum: + - allwinner,sun4i-a10-sid + - allwinner,sun7i-a20-sid + - allwinner,sun8i-a83t-sid + - allwinner,sun8i-h3-sid + - allwinner,sun50i-a64-sid + - allwinner,sun50i-h5-sid + - allwinner,sun50i-h6-sid + + reg: + maxItems: 1 + +required: + - compatible + - reg + +# FIXME: We should set it, but it would report all the generic +# properties as additional properties. +# additionalProperties: false + +examples: + - | + sid@1c23800 { + compatible = "allwinner,sun4i-a10-sid"; + reg = <0x01c23800 0x10>; + }; + + - | + sid@1c23800 { + compatible = "allwinner,sun7i-a20-sid"; + reg = <0x01c23800 0x200>; + }; + +... diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt deleted file mode 100644 index cfb18b4ef8f7..000000000000 --- a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt +++ /dev/null @@ -1,29 +0,0 @@ -Allwinner sunxi-sid - -Required properties: -- compatible: Should be one of the following: - "allwinner,sun4i-a10-sid" - "allwinner,sun7i-a20-sid" - "allwinner,sun8i-a83t-sid" - "allwinner,sun8i-h3-sid" - "allwinner,sun50i-a64-sid" - "allwinner,sun50i-h5-sid" - "allwinner,sun50i-h6-sid" - -- reg: Should contain registers location and length - -= Data cells = -Are child nodes of sunxi-sid, bindings of which as described in -bindings/nvmem/nvmem.txt - -Example for sun4i: - sid@1c23800 { - compatible = "allwinner,sun4i-a10-sid"; - reg = <0x01c23800 0x10> - }; - -Example for sun7i: - sid@1c23800 { - compatible = "allwinner,sun7i-a20-sid"; - reg = <0x01c23800 0x200> - }; -- cgit From ae1c6b9aa34be021555e1234a271d56108caa82e Mon Sep 17 00:00:00 2001 From: Pankaj Bansal Date: Wed, 12 Jun 2019 08:52:56 +0000 Subject: dt-bindings: add register based devices' mux controller DT bindings This adds device tree binding documentation for generic register based multiplexer controlled by a bitfields in a parent device's register range. since MMIO mux is a special case of generic register based mux, the MMIO mux bindings have been subsumed in these bindings. Signed-off-by: Pankaj Bansal Signed-off-by: Peter Rosin Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/mux/mmio-mux.txt | 60 ---------- Documentation/devicetree/bindings/mux/reg-mux.txt | 129 +++++++++++++++++++++ 2 files changed, 129 insertions(+), 60 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mux/mmio-mux.txt create mode 100644 Documentation/devicetree/bindings/mux/reg-mux.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mux/mmio-mux.txt b/Documentation/devicetree/bindings/mux/mmio-mux.txt deleted file mode 100644 index a9bfb4d8b6ac..000000000000 --- a/Documentation/devicetree/bindings/mux/mmio-mux.txt +++ /dev/null @@ -1,60 +0,0 @@ -MMIO register bitfield-based multiplexer controller bindings - -Define register bitfields to be used to control multiplexers. The parent -device tree node must be a syscon node to provide register access. - -Required properties: -- compatible : "mmio-mux" -- #mux-control-cells : <1> -- mux-reg-masks : an array of register offset and pre-shifted bitfield mask - pairs, each describing a single mux control. -* Standard mux-controller bindings as decribed in mux-controller.txt - -Optional properties: -- idle-states : if present, the state the muxes will have when idle. The - special state MUX_IDLE_AS_IS is the default. - -The multiplexer state of each multiplexer is defined as the value of the -bitfield described by the corresponding register offset and bitfield mask pair -in the mux-reg-masks array, accessed through the parent syscon. - -Example: - - syscon { - compatible = "syscon"; - - mux: mux-controller { - compatible = "mmio-mux"; - #mux-control-cells = <1>; - - mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */ - <0x3 0x40>, /* 1: reg 0x3, bit 6 */ - idle-states = , <0>; - }; - }; - - video-mux { - compatible = "video-mux"; - mux-controls = <&mux 0>; - - ports { - /* inputs 0..3 */ - port@0 { - reg = <0>; - }; - port@1 { - reg = <1>; - }; - port@2 { - reg = <2>; - }; - port@3 { - reg = <3>; - }; - - /* output */ - port@4 { - reg = <4>; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/mux/reg-mux.txt b/Documentation/devicetree/bindings/mux/reg-mux.txt new file mode 100644 index 000000000000..4afd7ba73d60 --- /dev/null +++ b/Documentation/devicetree/bindings/mux/reg-mux.txt @@ -0,0 +1,129 @@ +Generic register bitfield-based multiplexer controller bindings + +Define register bitfields to be used to control multiplexers. The parent +device tree node must be a device node to provide register r/w access. + +Required properties: +- compatible : should be one of + "reg-mux" : if parent device of mux controller is not syscon device + "mmio-mux" : if parent device of mux controller is syscon device +- #mux-control-cells : <1> +- mux-reg-masks : an array of register offset and pre-shifted bitfield mask + pairs, each describing a single mux control. +* Standard mux-controller bindings as decribed in mux-controller.txt + +Optional properties: +- idle-states : if present, the state the muxes will have when idle. The + special state MUX_IDLE_AS_IS is the default. + +The multiplexer state of each multiplexer is defined as the value of the +bitfield described by the corresponding register offset and bitfield mask +pair in the mux-reg-masks array. + +Example 1: +The parent device of mux controller is not a syscon device. + +&i2c0 { + fpga@66 { // fpga connected to i2c + compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c", + "simple-mfd"; + reg = <0x66>; + + mux: mux-controller { + compatible = "reg-mux"; + #mux-control-cells = <1>; + mux-reg-masks = <0x54 0xf8>, /* 0: reg 0x54, bits 7:3 */ + <0x54 0x07>; /* 1: reg 0x54, bits 2:0 */ + }; + }; +}; + +mdio-mux-1 { + compatible = "mdio-mux-multiplexer"; + mux-controls = <&mux 0>; + mdio-parent-bus = <&emdio1>; + #address-cells = <1>; + #size-cells = <0>; + + mdio@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + + mdio@8 { + reg = <0x8>; + #address-cells = <1>; + #size-cells = <0>; + }; + + .. + .. +}; + +mdio-mux-2 { + compatible = "mdio-mux-multiplexer"; + mux-controls = <&mux 1>; + mdio-parent-bus = <&emdio2>; + #address-cells = <1>; + #size-cells = <0>; + + mdio@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + + mdio@1 { + reg = <0x1>; + #address-cells = <1>; + #size-cells = <0>; + }; + + .. + .. +}; + +Example 2: +The parent device of mux controller is syscon device. + +syscon { + compatible = "syscon"; + + mux: mux-controller { + compatible = "mmio-mux"; + #mux-control-cells = <1>; + + mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */ + <0x3 0x40>, /* 1: reg 0x3, bit 6 */ + idle-states = , <0>; + }; +}; + +video-mux { + compatible = "video-mux"; + mux-controls = <&mux 0>; + #address-cells = <1>; + #size-cells = <0>; + + ports { + /* inputs 0..3 */ + port@0 { + reg = <0>; + }; + port@1 { + reg = <1>; + }; + port@2 { + reg = <2>; + }; + port@3 { + reg = <3>; + }; + + /* output */ + port@4 { + reg = <4>; + }; + }; +}; -- cgit From 7a82a01e5c850986d4b53dab52ac71294285586f Mon Sep 17 00:00:00 2001 From: Dragan Cvetic Date: Tue, 11 Jun 2019 18:29:35 +0100 Subject: dt-bindings: xilinx-sdfec: Add SDFEC binding Add the Soft Decision Forward Error Correction (SDFEC) Engine bindings which is available for the Zynq UltraScale+ RFSoC FPGA's. Signed-off-by: Dragan Cvetic Signed-off-by: Derek Kiernan Reviewed-by: Rob Herring .../devicetree/bindings/misc/xlnx,sd-fec.txt | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/misc/xlnx,sd-fec.txt | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt new file mode 100644 index 000000000000..e3289634fa30 --- /dev/null +++ b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt @@ -0,0 +1,58 @@ +* Xilinx SDFEC(16nm) IP * + +The Soft Decision Forward Error Correction (SDFEC) Engine is a Hard IP block +which provides high-throughput LDPC and Turbo Code implementations. +The LDPC decode & encode functionality is capable of covering a range of +customer specified Quasi-cyclic (QC) codes. The Turbo decode functionality +principally covers codes used by LTE. The FEC Engine offers significant +power and area savings versus implementations done in the FPGA fabric. + + +Required properties: +- compatible: Must be "xlnx,sd-fec-1.1" +- clock-names : List of input clock names from the following: + - "core_clk", Main processing clock for processing core (required) + - "s_axi_aclk", AXI4-Lite memory-mapped slave interface clock (required) + - "s_axis_din_aclk", DIN AXI4-Stream Slave interface clock (optional) + - "s_axis_din_words-aclk", DIN_WORDS AXI4-Stream Slave interface clock (optional) + - "s_axis_ctrl_aclk", Control input AXI4-Stream Slave interface clock (optional) + - "m_axis_dout_aclk", DOUT AXI4-Stream Master interface clock (optional) + - "m_axis_dout_words_aclk", DOUT_WORDS AXI4-Stream Master interface clock (optional) + - "m_axis_status_aclk", Status output AXI4-Stream Master interface clock (optional) +- clocks : Clock phandles (see clock_bindings.txt for details). +- reg: Should contain Xilinx SDFEC 16nm Hardened IP block registers + location and length. +- xlnx,sdfec-code : Should contain "ldpc" or "turbo" to describe the codes + being used. +- xlnx,sdfec-din-words : A value 0 indicates that the DIN_WORDS interface is + driven with a fixed value and is not present on the device, a value of 1 + configures the DIN_WORDS to be block based, while a value of 2 configures the + DIN_WORDS input to be supplied for each AXI transaction. +- xlnx,sdfec-din-width : Configures the DIN AXI stream where a value of 1 + configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width + of "4x128b". +- xlnx,sdfec-dout-words : A value 0 indicates that the DOUT_WORDS interface is + driven with a fixed value and is not present on the device, a value of 1 + configures the DOUT_WORDS to be block based, while a value of 2 configures the + DOUT_WORDS input to be supplied for each AXI transaction. +- xlnx,sdfec-dout-width : Configures the DOUT AXI stream where a value of 1 + configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width + of "4x128b". +Optional properties: +- interrupts: should contain SDFEC interrupt number + +Example +--------------------------------------- + sd_fec_0: sd-fec@a0040000 { + compatible = "xlnx,sd-fec-1.1"; + clock-names = "core_clk","s_axi_aclk","s_axis_ctrl_aclk","s_axis_din_aclk","m_axis_status_aclk","m_axis_dout_aclk"; + clocks = <&misc_clk_2>,<&misc_clk_0>,<&misc_clk_1>,<&misc_clk_1>,<&misc_clk_1>, <&misc_clk_1>; + reg = <0x0 0xa0040000 0x0 0x40000>; + interrupt-parent = <&axi_intc>; + interrupts = <1 0>; + xlnx,sdfec-code = "ldpc"; + xlnx,sdfec-din-words = <0>; + xlnx,sdfec-din-width = <2>; + xlnx,sdfec-dout-words = <0>; + xlnx,sdfec-dout-width = <1>; + }; -- cgit From 22d137e283e63a62414464744302e01f28196ffc Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 4 Jun 2019 16:30:15 +0200 Subject: dt-bindings: memory: jz4780: Add compatible string for JZ4740 SoC Add a compatible string to support the memory controller built into the JZ4740 SoC from Ingenic. Signed-off-by: Paul Cercueil Reviewed-by: Boris Brezillon Reviewed-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt index f936b5589b19..59b8dcc118ee 100644 --- a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt +++ b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt @@ -5,6 +5,7 @@ controller in Ingenic JZ4780 Required properties: - compatible: Should be set to one of: + "ingenic,jz4740-nemc" (JZ4740) "ingenic,jz4780-nemc" (JZ4780) - reg: Should specify the NEMC controller registers location and length. - clocks: Clock for the NEMC controller. -- cgit From fd757dbac5f6c42f45b1a29a354255045a655a27 Mon Sep 17 00:00:00 2001 From: Tomasz Figa Date: Fri, 21 Jun 2019 13:13:51 +0200 Subject: dt-bindings: extcon: Add support for fsa9480 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds documentation for binding of extcont Fairchild Semiconductor FSA9480 microusb switch. This usb port accessory detector and switch, can be found for example in some Samsung s5pv210 based phones. Signed-off-by: Tomasz Figa Signed-off-by: Paweł Chmiel Acked-by: Chanwoo Choi Signed-off-by: Chanwoo Choi --- .../devicetree/bindings/extcon/extcon-fsa9480.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt new file mode 100644 index 000000000000..d592c21245f2 --- /dev/null +++ b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt @@ -0,0 +1,19 @@ +FAIRCHILD SEMICONDUCTOR FSA9480 MICROUSB SWITCH + +The FSA9480 is a USB port accessory detector and switch. The FSA9480 is fully +controlled using I2C and enables USB data, stereo and mono audio, video, +microphone, and UART data to use a common connector port. + +Required properties: + - compatible : Must be "fcs,fsa9480" + - reg : Specifies i2c slave address. Must be 0x25. + - interrupts : Should contain one entry specifying interrupt signal of + interrupt parent to which interrupt pin of the chip is connected. + + Example: + musb@25 { + compatible = "fcs,fsa9480"; + reg = <0x25>; + interrupt-parent = <&gph2>; + interrupts = <7 0>; + }; -- cgit From 25c7eabed5b219e975c8aee527ae202604d10911 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Wed, 26 Jun 2019 11:27:32 +0100 Subject: dt-bindings: imx-ocotp: Add i.MX8MM compatible Add compatible for i.MX8MM as per arch/arm64/boot/dts/freescale/imx8mm.dtsi Signed-off-by: Bryan O'Donoghue Cc: Rob Herring Reviewed-by: Leonard Crestez Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/imx-ocotp.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt index 68f7d6fdd140..96ffd06d2ca8 100644 --- a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt +++ b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt @@ -15,6 +15,7 @@ Required properties: "fsl,imx6sll-ocotp" (i.MX6SLL), "fsl,imx7ulp-ocotp" (i.MX7ULP), "fsl,imx8mq-ocotp" (i.MX8MQ), + "fsl,imx8mm-ocotp" (i.MX8MM), followed by "syscon". - #address-cells : Should be 1 - #size-cells : Should be 1 -- cgit From 9b5db89ea4bfdbb23d4f85f3a7fbf2cd36d20146 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 18:23:13 -0300 Subject: docs: misc-devices: convert files without extension to ReST Those files are also text files. Convert them to ReST and add to the misc-files index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/b7dc829809673bd8cffe0e7bbe9c9308681c6fe2.1561756511.git.mchehab+samsung@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/misc-devices/eeprom | 96 --------------------- Documentation/misc-devices/eeprom.rst | 107 +++++++++++++++++++++++ Documentation/misc-devices/ics932s401 | 31 ------- Documentation/misc-devices/ics932s401.rst | 36 ++++++++ Documentation/misc-devices/index.rst | 5 ++ Documentation/misc-devices/isl29003 | 62 -------------- Documentation/misc-devices/isl29003.rst | 75 ++++++++++++++++ Documentation/misc-devices/lis3lv02d | 93 -------------------- Documentation/misc-devices/lis3lv02d.rst | 99 ++++++++++++++++++++++ Documentation/misc-devices/max6875 | 110 ------------------------ Documentation/misc-devices/max6875.rst | 136 ++++++++++++++++++++++++++++++ 11 files changed, 458 insertions(+), 392 deletions(-) delete mode 100644 Documentation/misc-devices/eeprom create mode 100644 Documentation/misc-devices/eeprom.rst delete mode 100644 Documentation/misc-devices/ics932s401 create mode 100644 Documentation/misc-devices/ics932s401.rst delete mode 100644 Documentation/misc-devices/isl29003 create mode 100644 Documentation/misc-devices/isl29003.rst delete mode 100644 Documentation/misc-devices/lis3lv02d create mode 100644 Documentation/misc-devices/lis3lv02d.rst delete mode 100644 Documentation/misc-devices/max6875 create mode 100644 Documentation/misc-devices/max6875.rst (limited to 'Documentation') diff --git a/Documentation/misc-devices/eeprom b/Documentation/misc-devices/eeprom deleted file mode 100644 index ba692011f221..000000000000 --- a/Documentation/misc-devices/eeprom +++ /dev/null @@ -1,96 +0,0 @@ -Kernel driver eeprom -==================== - -Supported chips: - * Any EEPROM chip in the designated address range - Prefix: 'eeprom' - Addresses scanned: I2C 0x50 - 0x57 - Datasheets: Publicly available from: - Atmel (www.atmel.com), - Catalyst (www.catsemi.com), - Fairchild (www.fairchildsemi.com), - Microchip (www.microchip.com), - Philips (www.semiconductor.philips.com), - Rohm (www.rohm.com), - ST (www.st.com), - Xicor (www.xicor.com), - and others. - - Chip Size (bits) Address - 24C01 1K 0x50 (shadows at 0x51 - 0x57) - 24C01A 1K 0x50 - 0x57 (Typical device on DIMMs) - 24C02 2K 0x50 - 0x57 - 24C04 4K 0x50, 0x52, 0x54, 0x56 - (additional data at 0x51, 0x53, 0x55, 0x57) - 24C08 8K 0x50, 0x54 (additional data at 0x51, 0x52, - 0x53, 0x55, 0x56, 0x57) - 24C16 16K 0x50 (additional data at 0x51 - 0x57) - Sony 2K 0x57 - - Atmel 34C02B 2K 0x50 - 0x57, SW write protect at 0x30-37 - Catalyst 34FC02 2K 0x50 - 0x57, SW write protect at 0x30-37 - Catalyst 34RC02 2K 0x50 - 0x57, SW write protect at 0x30-37 - Fairchild 34W02 2K 0x50 - 0x57, SW write protect at 0x30-37 - Microchip 24AA52 2K 0x50 - 0x57, SW write protect at 0x30-37 - ST M34C02 2K 0x50 - 0x57, SW write protect at 0x30-37 - - -Authors: - Frodo Looijaard , - Philip Edelbrock , - Jean Delvare , - Greg Kroah-Hartman , - IBM Corp. - -Description ------------ - -This is a simple EEPROM module meant to enable reading the first 256 bytes -of an EEPROM (on a SDRAM DIMM for example). However, it will access serial -EEPROMs on any I2C adapter. The supported devices are generically called -24Cxx, and are listed above; however the numbering for these -industry-standard devices may vary by manufacturer. - -This module was a programming exercise to get used to the new project -organization laid out by Frodo, but it should be at least completely -effective for decoding the contents of EEPROMs on DIMMs. - -DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants. -The other devices will not be found on a DIMM because they respond to more -than one address. - -DDC Monitors may contain any device. Often a 24C01, which responds to all 8 -addresses, is found. - -Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the -specification, so it is guess work and far from being complete. - -The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional -software write protect register at 0x30 - 0x37 (0x20 less than the memory -location). The chip responds to "write quick" detection at this address but -does not respond to byte reads. If this register is present, the lower 128 -bytes of the memory array are not write protected. Any byte data write to -this address will write protect the memory array permanently, and the -device will no longer respond at the 0x30-37 address. The eeprom driver -does not support this register. - -Lacking functionality: - -* Full support for larger devices (24C04, 24C08, 24C16). These are not -typically found on a PC. These devices will appear as separate devices at -multiple addresses. - -* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512). -These devices require two-byte address fields and are not supported. - -* Enable Writing. Again, no technical reason why not, but making it easy -to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy -to disable the DIMMs (potentially preventing the computer from booting) -until the values are restored somehow. - -Use: - -After inserting the module (and any other required SMBus/i2c modules), you -should have some EEPROM directories in /sys/bus/i2c/devices/* of names such -as "0-0050". Inside each of these is a series of files, the eeprom file -contains the binary data from EEPROM. diff --git a/Documentation/misc-devices/eeprom.rst b/Documentation/misc-devices/eeprom.rst new file mode 100644 index 000000000000..008249675ccc --- /dev/null +++ b/Documentation/misc-devices/eeprom.rst @@ -0,0 +1,107 @@ +==================== +Kernel driver eeprom +==================== + +Supported chips: + + * Any EEPROM chip in the designated address range + + Prefix: 'eeprom' + + Addresses scanned: I2C 0x50 - 0x57 + + Datasheets: Publicly available from: + + Atmel (www.atmel.com), + Catalyst (www.catsemi.com), + Fairchild (www.fairchildsemi.com), + Microchip (www.microchip.com), + Philips (www.semiconductor.philips.com), + Rohm (www.rohm.com), + ST (www.st.com), + Xicor (www.xicor.com), + and others. + + ========= ============= ============================================ + Chip Size (bits) Address + ========= ============= ============================================ + 24C01 1K 0x50 (shadows at 0x51 - 0x57) + 24C01A 1K 0x50 - 0x57 (Typical device on DIMMs) + 24C02 2K 0x50 - 0x57 + 24C04 4K 0x50, 0x52, 0x54, 0x56 + (additional data at 0x51, 0x53, 0x55, 0x57) + 24C08 8K 0x50, 0x54 (additional data at 0x51, 0x52, + 0x53, 0x55, 0x56, 0x57) + 24C16 16K 0x50 (additional data at 0x51 - 0x57) + Sony 2K 0x57 + + Atmel 34C02B 2K 0x50 - 0x57, SW write protect at 0x30-37 + Catalyst 34FC02 2K 0x50 - 0x57, SW write protect at 0x30-37 + Catalyst 34RC02 2K 0x50 - 0x57, SW write protect at 0x30-37 + Fairchild 34W02 2K 0x50 - 0x57, SW write protect at 0x30-37 + Microchip 24AA52 2K 0x50 - 0x57, SW write protect at 0x30-37 + ST M34C02 2K 0x50 - 0x57, SW write protect at 0x30-37 + ========= ============= ============================================ + + +Authors: + - Frodo Looijaard , + - Philip Edelbrock , + - Jean Delvare , + - Greg Kroah-Hartman , + - IBM Corp. + +Description +----------- + +This is a simple EEPROM module meant to enable reading the first 256 bytes +of an EEPROM (on a SDRAM DIMM for example). However, it will access serial +EEPROMs on any I2C adapter. The supported devices are generically called +24Cxx, and are listed above; however the numbering for these +industry-standard devices may vary by manufacturer. + +This module was a programming exercise to get used to the new project +organization laid out by Frodo, but it should be at least completely +effective for decoding the contents of EEPROMs on DIMMs. + +DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants. +The other devices will not be found on a DIMM because they respond to more +than one address. + +DDC Monitors may contain any device. Often a 24C01, which responds to all 8 +addresses, is found. + +Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the +specification, so it is guess work and far from being complete. + +The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional +software write protect register at 0x30 - 0x37 (0x20 less than the memory +location). The chip responds to "write quick" detection at this address but +does not respond to byte reads. If this register is present, the lower 128 +bytes of the memory array are not write protected. Any byte data write to +this address will write protect the memory array permanently, and the +device will no longer respond at the 0x30-37 address. The eeprom driver +does not support this register. + +Lacking functionality +--------------------- + +* Full support for larger devices (24C04, 24C08, 24C16). These are not + typically found on a PC. These devices will appear as separate devices at + multiple addresses. + +* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512). + These devices require two-byte address fields and are not supported. + +* Enable Writing. Again, no technical reason why not, but making it easy + to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy + to disable the DIMMs (potentially preventing the computer from booting) + until the values are restored somehow. + +Use +--- + +After inserting the module (and any other required SMBus/i2c modules), you +should have some EEPROM directories in ``/sys/bus/i2c/devices/*`` of names such +as "0-0050". Inside each of these is a series of files, the eeprom file +contains the binary data from EEPROM. diff --git a/Documentation/misc-devices/ics932s401 b/Documentation/misc-devices/ics932s401 deleted file mode 100644 index bdac67ff6e3f..000000000000 --- a/Documentation/misc-devices/ics932s401 +++ /dev/null @@ -1,31 +0,0 @@ -Kernel driver ics932s401 -====================== - -Supported chips: - * IDT ICS932S401 - Prefix: 'ics932s401' - Addresses scanned: I2C 0x69 - Datasheet: Publicly available at the IDT website - -Author: Darrick J. Wong - -Description ------------ - -This driver implements support for the IDT ICS932S401 chip family. - -This chip has 4 clock outputs--a base clock for the CPU (which is likely -multiplied to get the real CPU clock), a system clock, a PCI clock, a USB -clock, and a reference clock. The driver reports selected and actual -frequency. If spread spectrum mode is enabled, the driver also reports by what -percent the clock signal is being spread, which should be between 0 and -0.5%. -All frequencies are reported in KHz. - -The ICS932S401 monitors all inputs continuously. The driver will not read -the registers more often than once every other second. - -Special Features ----------------- - -The clocks could be reprogrammed to increase system speed. I will not help you -do this, as you risk damaging your system! diff --git a/Documentation/misc-devices/ics932s401.rst b/Documentation/misc-devices/ics932s401.rst new file mode 100644 index 000000000000..613ee54a9c21 --- /dev/null +++ b/Documentation/misc-devices/ics932s401.rst @@ -0,0 +1,36 @@ +======================== +Kernel driver ics932s401 +======================== + +Supported chips: + + * IDT ICS932S401 + + Prefix: 'ics932s401' + + Addresses scanned: I2C 0x69 + + Datasheet: Publicly available at the IDT website + +Author: Darrick J. Wong + +Description +----------- + +This driver implements support for the IDT ICS932S401 chip family. + +This chip has 4 clock outputs--a base clock for the CPU (which is likely +multiplied to get the real CPU clock), a system clock, a PCI clock, a USB +clock, and a reference clock. The driver reports selected and actual +frequency. If spread spectrum mode is enabled, the driver also reports by what +percent the clock signal is being spread, which should be between 0 and -0.5%. +All frequencies are reported in KHz. + +The ICS932S401 monitors all inputs continuously. The driver will not read +the registers more often than once every other second. + +Special Features +---------------- + +The clocks could be reprogrammed to increase system speed. I will not help you +do this, as you risk damaging your system! diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst index dfd1f45a3127..a57f92dfe49a 100644 --- a/Documentation/misc-devices/index.rst +++ b/Documentation/misc-devices/index.rst @@ -14,4 +14,9 @@ fit into other categories. .. toctree:: :maxdepth: 2 + eeprom ibmvmc + ics932s401 + isl29003 + lis3lv02d + max6875 diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003 deleted file mode 100644 index 80b952fd32ff..000000000000 --- a/Documentation/misc-devices/isl29003 +++ /dev/null @@ -1,62 +0,0 @@ -Kernel driver isl29003 -===================== - -Supported chips: -* Intersil ISL29003 -Prefix: 'isl29003' -Addresses scanned: none -Datasheet: -http://www.intersil.com/data/fn/fn7464.pdf - -Author: Daniel Mack - - -Description ------------ -The ISL29003 is an integrated light sensor with a 16-bit integrating type -ADC, I2C user programmable lux range select for optimized counts/lux, and -I2C multi-function control and monitoring capabilities. The internal ADC -provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by -artificial light sources. - -The driver allows to set the lux range, the bit resolution, the operational -mode (see below) and the power state of device and can read the current lux -value, of course. - - -Detection ---------- - -The ISL29003 does not have an ID register which could be used to identify -it, so the detection routine will just try to read from the configured I2C -address and consider the device to be present as soon as it ACKs the -transfer. - - -Sysfs entries -------------- - -range: - 0: 0 lux to 1000 lux (default) - 1: 0 lux to 4000 lux - 2: 0 lux to 16,000 lux - 3: 0 lux to 64,000 lux - -resolution: - 0: 2^16 cycles (default) - 1: 2^12 cycles - 2: 2^8 cycles - 3: 2^4 cycles - -mode: - 0: diode1's current (unsigned 16bit) (default) - 1: diode1's current (unsigned 16bit) - 2: difference between diodes (l1 - l2, signed 15bit) - -power_state: - 0: device is disabled (default) - 1: device is enabled - -lux (read only): - returns the value from the last sensor reading - diff --git a/Documentation/misc-devices/isl29003.rst b/Documentation/misc-devices/isl29003.rst new file mode 100644 index 000000000000..0cc38aed6c00 --- /dev/null +++ b/Documentation/misc-devices/isl29003.rst @@ -0,0 +1,75 @@ +====================== +Kernel driver isl29003 +====================== + +Supported chips: + +* Intersil ISL29003 + +Prefix: 'isl29003' + +Addresses scanned: none + +Datasheet: +http://www.intersil.com/data/fn/fn7464.pdf + +Author: Daniel Mack + + +Description +----------- +The ISL29003 is an integrated light sensor with a 16-bit integrating type +ADC, I2C user programmable lux range select for optimized counts/lux, and +I2C multi-function control and monitoring capabilities. The internal ADC +provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by +artificial light sources. + +The driver allows to set the lux range, the bit resolution, the operational +mode (see below) and the power state of device and can read the current lux +value, of course. + + +Detection +--------- + +The ISL29003 does not have an ID register which could be used to identify +it, so the detection routine will just try to read from the configured I2C +address and consider the device to be present as soon as it ACKs the +transfer. + + +Sysfs entries +------------- + +range: + == =========================== + 0: 0 lux to 1000 lux (default) + 1: 0 lux to 4000 lux + 2: 0 lux to 16,000 lux + 3: 0 lux to 64,000 lux + == =========================== + +resolution: + == ===================== + 0: 2^16 cycles (default) + 1: 2^12 cycles + 2: 2^8 cycles + 3: 2^4 cycles + == ===================== + +mode: + == ================================================= + 0: diode1's current (unsigned 16bit) (default) + 1: diode1's current (unsigned 16bit) + 2: difference between diodes (l1 - l2, signed 15bit) + == ================================================= + +power_state: + == ================================================= + 0: device is disabled (default) + 1: device is enabled + == ================================================= + +lux (read only): + returns the value from the last sensor reading + diff --git a/Documentation/misc-devices/lis3lv02d b/Documentation/misc-devices/lis3lv02d deleted file mode 100644 index f89960a0ff95..000000000000 --- a/Documentation/misc-devices/lis3lv02d +++ /dev/null @@ -1,93 +0,0 @@ -Kernel driver lis3lv02d -======================= - -Supported chips: - - * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision) - * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and - LIS331DLH (16 bits) - -Authors: - Yan Burman - Eric Piel - - -Description ------------ - -This driver provides support for the accelerometer found in various HP laptops -sporting the feature officially called "HP Mobile Data Protection System 3D" or -"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known -models (full list can be found in drivers/platform/x86/hp_accel.c) will have -their axis automatically oriented on standard way (eg: you can directly play -neverball). The accelerometer data is readable via -/sys/devices/platform/lis3lv02d. Reported values are scaled -to mg values (1/1000th of earth gravity). - -Sysfs attributes under /sys/devices/platform/lis3lv02d/: -position - 3D position that the accelerometer reports. Format: "(x,y,z)" -rate - read reports the sampling rate of the accelerometer device in HZ. - write changes sampling rate of the accelerometer device. - Only values which are supported by HW are accepted. -selftest - performs selftest for the chip as specified by chip manufacturer. - -This driver also provides an absolute input class device, allowing -the laptop to act as a pinball machine-esque joystick. Joystick device can be -calibrated. Joystick device can be in two different modes. -By default output values are scaled between -32768 .. 32767. In joystick raw -mode, joystick and sysfs position entry have the same scale. There can be -small difference due to input system fuzziness feature. -Events are also available as input event device. - -Selftest is meant only for hardware diagnostic purposes. It is not meant to be -used during normal operations. Position data is not corrupted during selftest -but interrupt behaviour is not guaranteed to work reliably. In test mode, the -sensing element is internally moved little bit. Selftest measures difference -between normal mode and test mode. Chip specifications tell the acceptance -limit for each type of the chip. Limits are provided via platform data -to allow adjustment of the limits without a change to the actual driver. -Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are -measured difference between modes. Axes are not remapped in selftest mode. -Measurement values are provided to help HW diagnostic applications to make -final decision. - -On HP laptops, if the led infrastructure is activated, support for a led -indicating disk protection will be provided as /sys/class/leds/hp::hddprotect. - -Another feature of the driver is misc device called "freefall" that -acts similar to /dev/rtc and reacts on free-fall interrupts received -from the device. It supports blocking operations, poll/select and -fasync operation modes. You must read 1 bytes from the device. The -result is number of free-fall interrupts since the last successful -read (or 255 if number of interrupts would not fit). See the freefall.c -file for an example on using the device. - - -Axes orientation ----------------- - -For better compatibility between the various laptops. The values reported by -the accelerometer are converted into a "standard" organisation of the axes -(aka "can play neverball out of the box"): - * When the laptop is horizontal the position reported is about 0 for X and Y - and a positive value for Z - * If the left side is elevated, X increases (becomes positive) - * If the front side (where the touchpad is) is elevated, Y decreases - (becomes negative) - * If the laptop is put upside-down, Z becomes negative - -If your laptop model is not recognized (cf "dmesg"), you can send an -email to the maintainer to add it to the database. When reporting a new -laptop, please include the output of "dmidecode" plus the value of -/sys/devices/platform/lis3lv02d/position in these four cases. - -Q&A ---- - -Q: How do I safely simulate freefall? I have an HP "portable -workstation" which has about 3.5kg and a plastic case, so letting it -fall to the ground is out of question... - -A: The sensor is pretty sensitive, so your hands can do it. Lift it -into free space, follow the fall with your hands for like 10 -centimeters. That should be enough to trigger the detection. diff --git a/Documentation/misc-devices/lis3lv02d.rst b/Documentation/misc-devices/lis3lv02d.rst new file mode 100644 index 000000000000..959bd2b822cf --- /dev/null +++ b/Documentation/misc-devices/lis3lv02d.rst @@ -0,0 +1,99 @@ +======================= +Kernel driver lis3lv02d +======================= + +Supported chips: + + * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision) + * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and + LIS331DLH (16 bits) + +Authors: + - Yan Burman + - Eric Piel + + +Description +----------- + +This driver provides support for the accelerometer found in various HP laptops +sporting the feature officially called "HP Mobile Data Protection System 3D" or +"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known +models (full list can be found in drivers/platform/x86/hp_accel.c) will have +their axis automatically oriented on standard way (eg: you can directly play +neverball). The accelerometer data is readable via +/sys/devices/platform/lis3lv02d. Reported values are scaled +to mg values (1/1000th of earth gravity). + +Sysfs attributes under /sys/devices/platform/lis3lv02d/: + +position + - 3D position that the accelerometer reports. Format: "(x,y,z)" +rate + - read reports the sampling rate of the accelerometer device in HZ. + write changes sampling rate of the accelerometer device. + Only values which are supported by HW are accepted. +selftest + - performs selftest for the chip as specified by chip manufacturer. + +This driver also provides an absolute input class device, allowing +the laptop to act as a pinball machine-esque joystick. Joystick device can be +calibrated. Joystick device can be in two different modes. +By default output values are scaled between -32768 .. 32767. In joystick raw +mode, joystick and sysfs position entry have the same scale. There can be +small difference due to input system fuzziness feature. +Events are also available as input event device. + +Selftest is meant only for hardware diagnostic purposes. It is not meant to be +used during normal operations. Position data is not corrupted during selftest +but interrupt behaviour is not guaranteed to work reliably. In test mode, the +sensing element is internally moved little bit. Selftest measures difference +between normal mode and test mode. Chip specifications tell the acceptance +limit for each type of the chip. Limits are provided via platform data +to allow adjustment of the limits without a change to the actual driver. +Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are +measured difference between modes. Axes are not remapped in selftest mode. +Measurement values are provided to help HW diagnostic applications to make +final decision. + +On HP laptops, if the led infrastructure is activated, support for a led +indicating disk protection will be provided as /sys/class/leds/hp::hddprotect. + +Another feature of the driver is misc device called "freefall" that +acts similar to /dev/rtc and reacts on free-fall interrupts received +from the device. It supports blocking operations, poll/select and +fasync operation modes. You must read 1 bytes from the device. The +result is number of free-fall interrupts since the last successful +read (or 255 if number of interrupts would not fit). See the freefall.c +file for an example on using the device. + + +Axes orientation +---------------- + +For better compatibility between the various laptops. The values reported by +the accelerometer are converted into a "standard" organisation of the axes +(aka "can play neverball out of the box"): + + * When the laptop is horizontal the position reported is about 0 for X and Y + and a positive value for Z + * If the left side is elevated, X increases (becomes positive) + * If the front side (where the touchpad is) is elevated, Y decreases + (becomes negative) + * If the laptop is put upside-down, Z becomes negative + +If your laptop model is not recognized (cf "dmesg"), you can send an +email to the maintainer to add it to the database. When reporting a new +laptop, please include the output of "dmidecode" plus the value of +/sys/devices/platform/lis3lv02d/position in these four cases. + +Q&A +--- + +Q: How do I safely simulate freefall? I have an HP "portable +workstation" which has about 3.5kg and a plastic case, so letting it +fall to the ground is out of question... + +A: The sensor is pretty sensitive, so your hands can do it. Lift it +into free space, follow the fall with your hands for like 10 +centimeters. That should be enough to trigger the detection. diff --git a/Documentation/misc-devices/max6875 b/Documentation/misc-devices/max6875 deleted file mode 100644 index 2f2bd0b17b5d..000000000000 --- a/Documentation/misc-devices/max6875 +++ /dev/null @@ -1,110 +0,0 @@ -Kernel driver max6875 -===================== - -Supported chips: - * Maxim MAX6874, MAX6875 - Prefix: 'max6875' - Addresses scanned: None (see below) - Datasheet: - http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf - -Author: Ben Gardner - - -Description ------------ - -The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor. -It provides timed outputs that can be used as a watchdog, if properly wired. -It also provides 512 bytes of user EEPROM. - -At reset, the MAX6875 reads the configuration EEPROM into its configuration -registers. The chip then begins to operate according to the values in the -registers. - -The Maxim MAX6874 is a similar, mostly compatible device, with more inputs -and outputs: - vin gpi vout -MAX6874 6 4 8 -MAX6875 4 3 5 - -See the datasheet for more information. - - -Sysfs entries -------------- - -eeprom - 512 bytes of user-defined EEPROM space. - - -General Remarks ---------------- - -Valid addresses for the MAX6875 are 0x50 and 0x52. -Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56. -The driver does not probe any address, so you explicitly instantiate the -devices. - -Example: -$ modprobe max6875 -$ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device - -The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple -addresses. For example, for address 0x50, it also reserves 0x51. -The even-address instance is called 'max6875', the odd one is 'dummy'. - - -Programming the chip using i2c-dev ----------------------------------- - -Use the i2c-dev interface to access and program the chips. -Reads and writes are performed differently depending on the address range. - -The configuration registers are at addresses 0x00 - 0x45. -Use i2c_smbus_write_byte_data() to write a register and -i2c_smbus_read_byte_data() to read a register. -The command is the register number. - -Examples: -To write a 1 to register 0x45: - i2c_smbus_write_byte_data(fd, 0x45, 1); - -To read register 0x45: - value = i2c_smbus_read_byte_data(fd, 0x45); - - -The configuration EEPROM is at addresses 0x8000 - 0x8045. -The user EEPROM is at addresses 0x8100 - 0x82ff. - -Use i2c_smbus_write_word_data() to write a byte to EEPROM. - -The command is the upper byte of the address: 0x80, 0x81, or 0x82. -The data word is the lower part of the address or'd with data << 8. - cmd = address >> 8; - val = (address & 0xff) | (data << 8); - -Example: -To write 0x5a to address 0x8003: - i2c_smbus_write_word_data(fd, 0x80, 0x5a03); - - -Reading data from the EEPROM is a little more complicated. -Use i2c_smbus_write_byte_data() to set the read address and then -i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data. - -Example: -To read data starting at offset 0x8100, first set the address: - i2c_smbus_write_byte_data(fd, 0x81, 0x00); - -And then read the data - value = i2c_smbus_read_byte(fd); - - or - - count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer); - -The block read should read 16 bytes. -0x84 is the block read command. - -See the datasheet for more details. - diff --git a/Documentation/misc-devices/max6875.rst b/Documentation/misc-devices/max6875.rst new file mode 100644 index 000000000000..ad419ac22a5b --- /dev/null +++ b/Documentation/misc-devices/max6875.rst @@ -0,0 +1,136 @@ +===================== +Kernel driver max6875 +===================== + +Supported chips: + + * Maxim MAX6874, MAX6875 + + Prefix: 'max6875' + + Addresses scanned: None (see below) + + Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf + +Author: Ben Gardner + + +Description +----------- + +The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor. +It provides timed outputs that can be used as a watchdog, if properly wired. +It also provides 512 bytes of user EEPROM. + +At reset, the MAX6875 reads the configuration EEPROM into its configuration +registers. The chip then begins to operate according to the values in the +registers. + +The Maxim MAX6874 is a similar, mostly compatible device, with more inputs +and outputs: + +=========== === === ==== +- vin gpi vout +=========== === === ==== +MAX6874 6 4 8 +MAX6875 4 3 5 +=========== === === ==== + +See the datasheet for more information. + + +Sysfs entries +------------- + +eeprom - 512 bytes of user-defined EEPROM space. + + +General Remarks +--------------- + +Valid addresses for the MAX6875 are 0x50 and 0x52. + +Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56. + +The driver does not probe any address, so you explicitly instantiate the +devices. + +Example:: + + $ modprobe max6875 + $ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device + +The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple +addresses. For example, for address 0x50, it also reserves 0x51. +The even-address instance is called 'max6875', the odd one is 'dummy'. + + +Programming the chip using i2c-dev +---------------------------------- + +Use the i2c-dev interface to access and program the chips. + +Reads and writes are performed differently depending on the address range. + +The configuration registers are at addresses 0x00 - 0x45. + +Use i2c_smbus_write_byte_data() to write a register and +i2c_smbus_read_byte_data() to read a register. + +The command is the register number. + +Examples: + +To write a 1 to register 0x45:: + + i2c_smbus_write_byte_data(fd, 0x45, 1); + +To read register 0x45:: + + value = i2c_smbus_read_byte_data(fd, 0x45); + + +The configuration EEPROM is at addresses 0x8000 - 0x8045. + +The user EEPROM is at addresses 0x8100 - 0x82ff. + +Use i2c_smbus_write_word_data() to write a byte to EEPROM. + +The command is the upper byte of the address: 0x80, 0x81, or 0x82. +The data word is the lower part of the address or'd with data << 8:: + + cmd = address >> 8; + val = (address & 0xff) | (data << 8); + +Example: + +To write 0x5a to address 0x8003:: + + i2c_smbus_write_word_data(fd, 0x80, 0x5a03); + + +Reading data from the EEPROM is a little more complicated. + +Use i2c_smbus_write_byte_data() to set the read address and then +i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data. + +Example: + +To read data starting at offset 0x8100, first set the address:: + + i2c_smbus_write_byte_data(fd, 0x81, 0x00); + +And then read the data:: + + value = i2c_smbus_read_byte(fd); + +or:: + + count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer); + +The block read should read 16 bytes. + +0x84 is the block read command. + +See the datasheet for more details. + -- cgit From 7fc1148cfde1c0d53930c7a5ac62b5bffe7b0e0b Mon Sep 17 00:00:00 2001 From: Zhiyong Tao Date: Wed, 24 Apr 2019 09:11:11 +0800 Subject: dt-bindings: adc: mt8183: add binding document The commit adds mt8183 compatible node in binding document. Signed-off-by: Zhiyong Tao Reviewed-by: Rob Herring Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt index 0df9befdaecc..936a0b4666da 100644 --- a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt +++ b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt @@ -15,6 +15,7 @@ Required properties: - "mediatek,mt2712-auxadc": For MT2712 family of SoCs - "mediatek,mt7622-auxadc": For MT7622 family of SoCs - "mediatek,mt8173-auxadc": For MT8173 family of SoCs + - "mediatek,mt8183-auxadc", "mediatek,mt8173-auxadc": For MT8183 family of SoCs - reg: Address range of the AUXADC unit. - clocks: Should contain a clock specifier for each entry in clock-names - clock-names: Should contain "main". -- cgit From cf54f4dd07a652d9a0630ec4e82c5ff082d2a14e Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Sat, 27 Apr 2019 14:23:59 -0400 Subject: dt-bindings: iio: isl29018: convert bindings to YAML format Convert the isl29018 device tree bindings to the new YAML format. Signed-off-by: Brian Masney Reviewed-by: Rob Herring Signed-off-by: Jonathan Cameron --- .../devicetree/bindings/iio/light/isl29018.txt | 27 ----------- .../devicetree/bindings/iio/light/isl29018.yaml | 56 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 27 deletions(-) delete mode 100644 Documentation/devicetree/bindings/iio/light/isl29018.txt create mode 100644 Documentation/devicetree/bindings/iio/light/isl29018.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.txt b/Documentation/devicetree/bindings/iio/light/isl29018.txt deleted file mode 100644 index b9bbde3e13ed..000000000000 --- a/Documentation/devicetree/bindings/iio/light/isl29018.txt +++ /dev/null @@ -1,27 +0,0 @@ -* ISL 29018/29023/29035 I2C ALS, Proximity, and Infrared sensor - -Required properties: - - - compatible: Should be one of - "isil,isl29018" - "isil,isl29023" - "isil,isl29035" - - reg: the I2C address of the device - -Optional properties: - - - interrupts: the sole interrupt generated by the device - - Refer to interrupt-controller/interrupts.txt for generic interrupt client - node bindings. - - - vcc-supply: phandle to the regulator that provides power to the sensor. - -Example: - -isl29018@44 { - compatible = "isil,isl29018"; - reg = <0x44>; - interrupt-parent = <&gpio>; - interrupts = ; -}; diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.yaml b/Documentation/devicetree/bindings/iio/light/isl29018.yaml new file mode 100644 index 000000000000..cbb00be8f359 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/isl29018.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/light/isl29018.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: | + Intersil 29018/29023/29035 Ambient Light, Infrared Light, and Proximity Sensor + +maintainers: + - Brian Masney + +description: | + Ambient and infrared light sensing with proximity detection over an i2c + interface. + + https://www.renesas.com/us/en/www/doc/datasheet/isl29018.pdf + https://www.renesas.com/us/en/www/doc/datasheet/isl29023.pdf + https://www.renesas.com/us/en/www/doc/datasheet/isl29035.pdf + +properties: + compatible: + enum: + - isil,isl29018 + - isil,isl29023 + - isil,isl29035 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + vcc-supply: + description: Regulator that provides power to the sensor + +required: + - compatible + - reg + +examples: + - | + #include + + i2c { + + #address-cells = <1>; + #size-cells = <0>; + + sensor@44 { + compatible = "isil,isl29018"; + reg = <0x44>; + interrupts-extended = <&msmgpio 61 IRQ_TYPE_LEVEL_HIGH>; + }; + }; +... -- cgit From f7b99e5948e6be8b850eecbb2de1986d26a76b36 Mon Sep 17 00:00:00 2001 From: Radu Pirea Date: Sun, 5 May 2019 21:06:45 +0300 Subject: dt-bindings: mfd: atmel-usart: add DMA bindings for USART in SPI mode The bindings for DMA are now common for both drivers of the USART IP. The node given as an example for USART in SPI mode has been updated in order to include DMA bindings. Signed-off-by: Radu Pirea Signed-off-by: Mark Brown --- .../devicetree/bindings/mfd/atmel-usart.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/atmel-usart.txt b/Documentation/devicetree/bindings/mfd/atmel-usart.txt index 7f0cd72f47d2..699fd3c9ace8 100644 --- a/Documentation/devicetree/bindings/mfd/atmel-usart.txt +++ b/Documentation/devicetree/bindings/mfd/atmel-usart.txt @@ -17,17 +17,24 @@ Required properties for USART in SPI mode: - cs-gpios: chipselects (internal cs not supported) - atmel,usart-mode : Must be (found in dt-bindings/mfd/at91-usart.h) +Optional properties in serial and SPI mode: +- dma bindings for dma transfer: + - dmas: DMA specifier, consisting of a phandle to DMA controller node, + memory peripheral interface and USART DMA channel ID, FIFO configuration. + The order of DMA channels is fixed. The first DMA channel must be TX + associated channel and the second one must be RX associated channel. + Refer to dma.txt and atmel-dma.txt for details. + - dma-names: "tx" for TX channel. + "rx" for RX channel. + The order of dma-names is also fixed. The first name must be "tx" + and the second one must be "rx" as in the examples below. + Optional properties in serial mode: - atmel,use-dma-rx: use of PDC or DMA for receiving data - atmel,use-dma-tx: use of PDC or DMA for transmitting data - {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD line respectively. It will use specified PIO instead of the peripheral function pin for the USART feature. If unsure, don't specify this property. -- add dma bindings for dma transfer: - - dmas: DMA specifier, consisting of a phandle to DMA controller node, - memory peripheral interface and USART DMA channel ID, FIFO configuration. - Refer to dma.txt and atmel-dma.txt for details. - - dma-names: "rx" for RX channel, "tx" for TX channel. - atmel,fifo-size: maximum number of data the RX and TX FIFOs can store for FIFO capable USARTs. - rs485-rts-delay, rs485-rx-during-tx, linux,rs485-enabled-at-boot-time: see rs485.txt @@ -81,5 +88,8 @@ Example: interrupts = <12 IRQ_TYPE_LEVEL_HIGH 5>; clocks = <&usart0_clk>; clock-names = "usart"; + dmas = <&dma0 2 AT91_DMA_CFG_PER_ID(3)>, + <&dma0 2 (AT91_DMA_CFG_PER_ID(4) | AT91_DMA_CFG_FIFOCFG_ASAP)>; + dma-names = "tx", "rx"; cs-gpios = <&pioB 3 0>; }; -- cgit From 977bfde5d4cbb9d2eb1969eb2eb671cdb9b9fcbf Mon Sep 17 00:00:00 2001 From: Eric Jeong Date: Thu, 18 Apr 2019 15:09:44 +0900 Subject: dt-bindings: regulator: add document bindings for slg51000 Add device tree binding information for slg51000 regulator driver. Example bindings for SLG51000 are added. Signed-off-by: Eric Jeong Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/slg51000.txt | 88 ++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 Documentation/devicetree/bindings/regulator/slg51000.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/regulator/slg51000.txt b/Documentation/devicetree/bindings/regulator/slg51000.txt new file mode 100644 index 000000000000..aa0733e49b90 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/slg51000.txt @@ -0,0 +1,88 @@ +* Dialog Semiconductor SLG51000 Voltage Regulator + +Required properties: +- compatible : Should be "dlg,slg51000" for SLG51000 +- reg : Specifies the I2C slave address. +- xxx-supply: Input voltage supply regulator for ldo3 to ldo7. + These entries are required if regulators are enabled for a device. + An absence of these properties can cause the regulator registration to fail. + If some of input supply is powered through battery or always-on supply then + also it is required to have these parameters with proper node handle of always + on power supply. + vin3-supply: Input supply for ldo3 + vin4-supply: Input supply for ldo4 + vin5-supply: Input supply for ldo5 + vin6-supply: Input supply for ldo6 + vin7-supply: Input supply for ldo7 + +Optional properties: +- interrupt-parent : Specifies the reference to the interrupt controller. +- interrupts : IRQ line information. +- dlg,cs-gpios : Specify a valid GPIO for chip select + +Sub-nodes: +- regulators : This node defines the settings for the regulators. + The content of the sub-node is defined by the standard binding + for regulators; see regulator.txt. + + The SLG51000 regulators are bound using their names listed below: + ldo1 + ldo2 + ldo3 + ldo4 + ldo5 + ldo6 + ldo7 + +Optional properties for regulators: +- enable-gpios : Specify a valid GPIO for platform control of the regulator. + +Example: + pmic: slg51000@75 { + compatible = "dlg,slg51000"; + reg = <0x75>; + + regulators { + ldo1 { + regulator-name = "ldo1"; + regulator-min-microvolt = <2400000>; + regulator-max-microvolt = <3300000>; + }; + + ldo2 { + regulator-name = "ldo2"; + regulator-min-microvolt = <2400000>; + regulator-max-microvolt = <3300000>; + }; + + ldo3 { + regulator-name = "ldo3"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3750000>; + }; + + ldo4 { + regulator-name = "ldo4"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3750000>; + }; + + ldo5 { + regulator-name = "ldo5"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <1200000>; + }; + + ldo6 { + regulator-name = "ldo6"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <1200000>; + }; + + ldo7 { + regulator-name = "ldo7"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <3750000>; + }; + }; + }; -- cgit From e35f5ad6a965de5d301ca5957a1c48c53fe366fb Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 15 May 2019 15:18:56 +0200 Subject: ASoC: meson: add tohdmitx DT bindings Add the bindings and the related documentation for the audio hdmitx control glue of the Amlogic g12a SoC family Signed-off-by: Jerome Brunet Tested-by: Neil Armstrong Tested-by: Kevin Hilman Signed-off-by: Mark Brown --- .../bindings/sound/amlogic,g12a-tohdmitx.txt | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt new file mode 100644 index 000000000000..aa6c35570d31 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt @@ -0,0 +1,55 @@ +* Amlogic HDMI Tx control glue + +Required properties: +- compatible: "amlogic,g12a-tohdmitx" +- reg: physical base address of the controller and length of memory + mapped region. +- #sound-dai-cells: should be 1. + +Example on the S905X2 SoC: + +tohdmitx: audio-controller@744 { + compatible = "amlogic,g12a-tohdmitx"; + reg = <0x0 0x744 0x0 0x4>; + #sound-dai-cells = <1>; +}; + +Example of an 'amlogic,axg-sound-card': + +sound { + compatible = "amlogic,axg-sound-card"; + +[...] + + dai-link-x { + sound-dai = <&tdmif_a>; + dai-format = "i2s"; + dai-tdm-slot-tx-mask-0 = <1 1>; + + codec-0 { + sound-dai = <&tohdmitx TOHDMITX_I2S_IN_A>; + }; + + codec-1 { + sound-dai = <&external_dac>; + }; + }; + + dai-link-y { + sound-dai = <&tdmif_c>; + dai-format = "i2s"; + dai-tdm-slot-tx-mask-0 = <1 1>; + + codec { + sound-dai = <&tohdmitx TOHDMITX_I2S_IN_C>; + }; + }; + + dai-link-z { + sound-dai = <&tohdmitx TOHDMITX_I2S_OUT>; + + codec { + sound-dai = <&hdmi_tx>; + }; + }; +}; -- cgit From 506c7f9b0612c46732aa74317f59a28d9477905b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 16 May 2019 09:59:25 +0200 Subject: dt-bindings: sound: Convert Allwinner SPDIF binding to YAML The Allwinner SoCs feature an SPDIF controller across multiple SoC generations. However, earlier generations were a bit simpler than the subsequent ones, and for example would always have RX and TX capabilities, and no reset lines. In order to express this, let's create two YAML schemas instead of the free form text we had before. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/sound/allwinner,sun4i-a10-spdif.yaml | 101 +++++++++++++++++++++ .../bindings/sound/sunxi,sun4i-spdif.txt | 42 --------- 2 files changed, 101 insertions(+), 42 deletions(-) create mode 100644 Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml delete mode 100644 Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml new file mode 100644 index 000000000000..5d72d48e923e --- /dev/null +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-spdif.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 S/PDIF Controller Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Liam Girdwood + - Mark Brown + - Maxime Ripard + +properties: + "#sound-dai-cells": + const: 0 + + compatible: + oneOf: + - const: allwinner,sun4i-a10-spdif + - const: allwinner,sun6i-a31-spdif + - const: allwinner,sun8i-h3-spdif + - items: + - const: allwinner,sun8i-a83t-spdif + - const: allwinner,sun8i-h3-spdif + - items: + - const: allwinner,sun50i-a64-spdif + - const: allwinner,sun8i-h3-spdif + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: Bus Clock + - description: Module Clock + + clock-names: + items: + - const: apb + - const: spdif + + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + + # Even though it only applies to subschemas under the conditionals, + # not listing them here will trigger a warning because of the + # additionalsProperties set to false. + resets: + maxItems: 1 + +allOf: + - if: + properties: + compatible: + contains: + enum: + - allwinner,sun6i-a31-spdif + - allwinner,sun8i-h3-spdif + + then: + required: + - resets + +required: + - "#sound-dai-cells" + - compatible + - reg + - interrupts + - clocks + - clock-names + - dmas + - dma-names + +additionalProperties: false + +examples: + - | + spdif: spdif@1c21000 { + #sound-dai-cells = <0>; + compatible = "allwinner,sun4i-a10-spdif"; + reg = <0x01c21000 0x40>; + interrupts = <13>; + clocks = <&apb0_gates 1>, <&spdif_clk>; + clock-names = "apb", "spdif"; + dmas = <&dma 0 2>, <&dma 0 2>; + dma-names = "rx", "tx"; + }; + +... diff --git a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt b/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt deleted file mode 100644 index 0c64a209c2e9..000000000000 --- a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt +++ /dev/null @@ -1,42 +0,0 @@ -Allwinner Sony/Philips Digital Interface Format (S/PDIF) Controller - -The Allwinner S/PDIF audio block is a transceiver that allows the -processor to receive and transmit digital audio via an coaxial cable or -a fibre cable. -For now only playback is supported. - -Required properties: - - - compatible : should be one of the following: - - "allwinner,sun4i-a10-spdif": for the Allwinner A10 SoC - - "allwinner,sun6i-a31-spdif": for the Allwinner A31 SoC - - "allwinner,sun8i-h3-spdif": for the Allwinner H3 SoC - - - reg : Offset and length of the register set for the device. - - - interrupts : Contains the spdif interrupt. - - - dmas : Generic dma devicetree binding as described in - Documentation/devicetree/bindings/dma/dma.txt. - - - dma-names : Two dmas have to be defined, "tx" and "rx". - - - clocks : Contains an entry for each entry in clock-names. - - - clock-names : Includes the following entries: - "apb" clock for the spdif bus. - "spdif" clock for spdif controller. - - - resets : reset specifier for the ahb reset (A31 and newer only) - -Example: - -spdif: spdif@1c21000 { - compatible = "allwinner,sun4i-a10-spdif"; - reg = <0x01c21000 0x40>; - interrupts = <13>; - clocks = <&apb0_gates 1>, <&spdif_clk>; - clock-names = "apb", "spdif"; - dmas = <&dma 0 2>, <&dma 0 2>; - dma-names = "rx", "tx"; -}; -- cgit From b1f35dfd7c2f509b0736f1ff02c314130b6b773e Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 16 May 2019 09:59:26 +0200 Subject: dt-bindings: sound: sun4i-spdif: Document that the RX channel can be missing The H3 and compatibles controllers don't have any reception capabilities, even though it was never documented as such in the binding before. Therefore, on those controllers, we don't have the option to set an RX DMA channel. This was already done in the DTSI, but the binding itself was never updated. Let's add a special case in the schemas. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/sound/allwinner,sun4i-a10-spdif.yaml | 38 ++++++++++++++++------ 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml index 5d72d48e923e..a49ef2294a74 100644 --- a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml @@ -44,19 +44,11 @@ properties: - const: apb - const: spdif - dmas: - items: - - description: RX DMA Channel - - description: TX DMA Channel - - dma-names: - items: - - const: rx - - const: tx - # Even though it only applies to subschemas under the conditionals, # not listing them here will trigger a warning because of the # additionalsProperties set to false. + dmas: true + dma-names: true resets: maxItems: 1 @@ -73,6 +65,32 @@ allOf: required: - resets + - if: + properties: + compatible: + contains: + const: allwinner,sun8i-h3-spdif + + then: + properties: + dmas: + description: TX DMA Channel + + dma-names: + const: tx + + else: + properties: + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + required: - "#sound-dai-cells" - compatible -- cgit From e359a29225dde53fade5fa4bc3d957599fb5f9a5 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Fri, 17 May 2019 18:44:41 +0300 Subject: dt-bindings: iio: accel: adxl345: switch to YAML bindings The ADX345 supports both I2C & SPI bindings. This change switches from old text bindings, to YAML bindings, and also tries to make use of the recent multiple-examples support. Signed-off-by: Alexandru Ardelean Reviewed-by: Rob Herring Signed-off-by: Jonathan Cameron --- .../devicetree/bindings/iio/accel/adi,adxl345.yaml | 72 ++++++++++++++++++++++ .../devicetree/bindings/iio/accel/adxl345.txt | 39 ------------ 2 files changed, 72 insertions(+), 39 deletions(-) create mode 100644 Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml delete mode 100644 Documentation/devicetree/bindings/iio/accel/adxl345.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml new file mode 100644 index 000000000000..7ba167e2e1ea --- /dev/null +++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl345.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers + +maintainers: + - Michael Hennerich + +description: | + Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers that supports + both I2C & SPI interfaces. + http://www.analog.com/en/products/mems/accelerometers/adxl345.html + http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html + +properties: + compatible: + enum: + - adi,adxl345 + - adi,adxl375 + + reg: + maxItems: 1 + + spi-cpha: true + + spi-cpol: true + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +examples: + - | + #include + #include + i2c0 { + #address-cells = <1>; + #size-cells = <0>; + + /* Example for a I2C device node */ + accelerometer@2a { + compatible = "adi,adxl345"; + reg = <0x53>; + interrupt-parent = <&gpio0>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; + }; + - | + #include + #include + spi0 { + #address-cells = <1>; + #size-cells = <0>; + + /* Example for a SPI device node */ + accelerometer@0 { + compatible = "adi,adxl345"; + reg = <0>; + spi-max-frequency = <5000000>; + spi-cpol; + spi-cpha; + interrupt-parent = <&gpio0>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; + }; diff --git a/Documentation/devicetree/bindings/iio/accel/adxl345.txt b/Documentation/devicetree/bindings/iio/accel/adxl345.txt deleted file mode 100644 index f9525f6e3d43..000000000000 --- a/Documentation/devicetree/bindings/iio/accel/adxl345.txt +++ /dev/null @@ -1,39 +0,0 @@ -Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers - -http://www.analog.com/en/products/mems/accelerometers/adxl345.html -http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html - -Required properties: - - compatible : should be one of - "adi,adxl345" - "adi,adxl375" - - reg : the I2C address or SPI chip select number of the sensor - -Required properties for SPI bus usage: - - spi-max-frequency : set maximum clock frequency, must be 5000000 - - spi-cpol and spi-cpha : must be defined for adxl345 to enable SPI mode 3 - -Optional properties: - - interrupts: interrupt mapping for IRQ as documented in - Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - -Example for a I2C device node: - - accelerometer@2a { - compatible = "adi,adxl345"; - reg = <0x53>; - interrupt-parent = <&gpio1>; - interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; - }; - -Example for a SPI device node: - - accelerometer@0 { - compatible = "adi,adxl345"; - reg = <0>; - spi-max-frequency = <5000000>; - spi-cpol; - spi-cpha; - interrupt-parent = <&gpio1>; - interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; - }; -- cgit From 73e1ccdab3c34b8f9be6e55600d40a1478049121 Mon Sep 17 00:00:00 2001 From: Chun-Hung Wu Date: Thu, 16 May 2019 16:10:44 +0800 Subject: dt-bindings: iio: adc: mediatek: Add document for mt6765 Add compatible node for mt6765 auxadc Signed-off-by: Chun-Hung Wu Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt index 936a0b4666da..78c06e05c8e5 100644 --- a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt +++ b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt @@ -13,6 +13,7 @@ Required properties: - compatible: Should be one of: - "mediatek,mt2701-auxadc": For MT2701 family of SoCs - "mediatek,mt2712-auxadc": For MT2712 family of SoCs + - "mediatek,mt6765-auxadc": For MT6765 family of SoCs - "mediatek,mt7622-auxadc": For MT7622 family of SoCs - "mediatek,mt8173-auxadc": For MT8173 family of SoCs - "mediatek,mt8183-auxadc", "mediatek,mt8173-auxadc": For MT8183 family of SoCs -- cgit From a02177a39344b643dccfa3b18fa5c1fc4984e1e5 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Mon, 13 May 2019 11:01:39 +0000 Subject: dt-bindings: imx-cpufreq-dt: Document opp-supported-hw usage The interpretation of opp-supported-hw bits for imx-cpufreq-dt driver is not very obvious so attempt to explain it. There is no OF compat string associated. Reviewed-by: Rob Herring Signed-off-by: Leonard Crestez Signed-off-by: Viresh Kumar --- .../devicetree/bindings/cpufreq/imx-cpufreq-dt.txt | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt new file mode 100644 index 000000000000..87bff5add3f9 --- /dev/null +++ b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt @@ -0,0 +1,37 @@ +i.MX CPUFreq-DT OPP bindings +================================ + +Certain i.MX SoCs support different OPPs depending on the "market segment" and +"speed grading" value which are written in fuses. These bits are combined with +the opp-supported-hw values for each OPP to check if the OPP is allowed. + +Required properties: +-------------------- + +For each opp entry in 'operating-points-v2' table: +- opp-supported-hw: Two bitmaps indicating: + - Supported speed grade mask + - Supported market segment mask + 0: Consumer + 1: Extended Consumer + 2: Industrial + 3: Automotive + +Example: +-------- + +opp_table { + compatible = "operating-points-v2"; + opp-1000000000 { + opp-hz = /bits/ 64 <1000000000>; + /* grade >= 0, consumer only */ + opp-supported-hw = <0xf>, <0x3>; + }; + + opp-1300000000 { + opp-hz = /bits/ 64 <1300000000>; + opp-microvolt = <1000000>; + /* grade >= 1, all segments */ + opp-supported-hw = <0xe>, <0x7>; + }; +} -- cgit From a28d1b67cf4f330fb532c8e515025b360d10691a Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 7 May 2019 21:38:48 +0200 Subject: dt-bindings: bus: Convert Allwinner RSB to a schema The newer Allwinner SoCs feature a bus designed for the PMIC, similar to I2C called RSB. Let's convert the device tree bindings for that bus over to a YAML schemas. Signed-off-by: Maxime Ripard --- .../bindings/bus/allwinner,sun8i-a23-rsb.yaml | 79 ++++++++++++++++++++++ .../devicetree/bindings/bus/sunxi-rsb.txt | 47 ------------- 2 files changed, 79 insertions(+), 47 deletions(-) create mode 100644 Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml delete mode 100644 Documentation/devicetree/bindings/bus/sunxi-rsb.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml new file mode 100644 index 000000000000..fc2f63860cc8 --- /dev/null +++ b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/bus/allwinner,sun8i-a23-rsb.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A23 RSB Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + compatible: + oneOf: + - const: allwinner,sun8i-a23-rsb + - items: + - const: allwinner,sun8i-a83t-rsb + - const: allwinner,sun8i-a23-rsb + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + + clock-frequency: + minimum: 1 + maximum: 20000000 + +patternProperties: + "^.*@[0-9a-fA-F]+$": + properties: + reg: + maxItems: 1 + + required: + - reg + +required: + - compatible + - reg + - interrupts + - clocks + - resets + +examples: + - | + rsb@1f03400 { + compatible = "allwinner,sun8i-a23-rsb"; + reg = <0x01f03400 0x400>; + interrupts = <0 39 4>; + clocks = <&apb0_gates 3>; + clock-frequency = <3000000>; + resets = <&apb0_rst 3>; + #address-cells = <1>; + #size-cells = <0>; + + pmic@3e3 { + compatible = "..."; + reg = <0x3e3>; + + /* ... */ + }; + }; + +additionalProperties: false diff --git a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt b/Documentation/devicetree/bindings/bus/sunxi-rsb.txt deleted file mode 100644 index eb3ed628c6f1..000000000000 --- a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt +++ /dev/null @@ -1,47 +0,0 @@ -Allwinner Reduced Serial Bus (RSB) controller - -The RSB controller found on later Allwinner SoCs is an SMBus like 2 wire -serial bus with 1 master and up to 15 slaves. It is represented by a node -for the controller itself, and child nodes representing the slave devices. - -Required properties : - - - reg : Offset and length of the register set for the controller. - - compatible : Shall be "allwinner,sun8i-a23-rsb". - - interrupts : The interrupt line associated to the RSB controller. - - clocks : The gate clk associated to the RSB controller. - - resets : The reset line associated to the RSB controller. - - #address-cells : shall be 1 - - #size-cells : shall be 0 - -Optional properties : - - - clock-frequency : Desired RSB bus clock frequency in Hz. Maximum is 20MHz. - If not set this defaults to 3MHz. - -Child nodes: - -An RSB controller node can contain zero or more child nodes representing -slave devices on the bus. Child 'reg' properties should contain the slave -device's hardware address. The hardware address is hardwired in the device, -which can normally be found in the datasheet. - -Example: - - rsb@1f03400 { - compatible = "allwinner,sun8i-a23-rsb"; - reg = <0x01f03400 0x400>; - interrupts = <0 39 4>; - clocks = <&apb0_gates 3>; - clock-frequency = <3000000>; - resets = <&apb0_rst 3>; - #address-cells = <1>; - #size-cells = <0>; - - pmic@3e3 { - compatible = "..."; - reg = <0x3e3>; - - /* ... */ - }; - }; -- cgit From 4c1ca625c622b7a9f04c2949fd1ffdc6effa86de Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 16 Apr 2019 19:20:47 -0600 Subject: platform/chrome: wilco_ec: Add Boot on AC support Boot on AC is a policy which makes the device boot from S5 when AC power is connected. This is useful for users who want to run their device headless or with a dock. Signed-off-by: Nick Crews Signed-off-by: Enric Balletbo i Serra --- Documentation/ABI/testing/sysfs-platform-wilco-ec | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-platform-wilco-ec (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-platform-wilco-ec b/Documentation/ABI/testing/sysfs-platform-wilco-ec new file mode 100644 index 000000000000..8e5d6eee44db --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-wilco-ec @@ -0,0 +1,9 @@ +What: /sys/bus/platform/devices/GOOG000C\:00/boot_on_ac +Date: April 2019 +KernelVersion: 5.3 +Description: + Boot on AC is a policy which makes the device boot from S5 + when AC power is connected. This is useful for users who + want to run their device headless or with a dock. + + Input should be parseable by kstrtou8() to 0 or 1. -- cgit From 2ad1f7a91449de48d4bd5d1ec361ba7bb9026505 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Wed, 8 May 2019 15:38:09 -0600 Subject: platform/chrome: wilco_ec: Remove 256 byte transfers The 0xF6 command, intended to send and receive 256 byte payloads to and from the EC, is not needed. The 0xF5 command for 32 byte payloads is sufficient. This patch removes support for the 0xF6 command and 256 byte payloads. Signed-off-by: Nick Crews Signed-off-by: Enric Balletbo i Serra --- Documentation/ABI/testing/debugfs-wilco-ec | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/debugfs-wilco-ec b/Documentation/ABI/testing/debugfs-wilco-ec index 73a5a66ddca6..9d8d9d2def5b 100644 --- a/Documentation/ABI/testing/debugfs-wilco-ec +++ b/Documentation/ABI/testing/debugfs-wilco-ec @@ -23,11 +23,9 @@ Description: For writing, bytes 0-1 indicate the message type, one of enum wilco_ec_msg_type. Byte 2+ consist of the data passed in the - request, starting at MBOX[0] - - At least three bytes are required for writing, two for the type - and at least a single byte of data. Only the first - EC_MAILBOX_DATA_SIZE bytes of MBOX will be used. + request, starting at MBOX[0]. At least three bytes are required + for writing, two for the type and at least a single byte of + data. Example: // Request EC info type 3 (EC firmware build date) @@ -40,7 +38,7 @@ Description: $ cat /sys/kernel/debug/wilco_ec/raw 00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 ..12/21/18.8... - Note that the first 32 bytes of the received MBOX[] will be - printed, even if some of the data is junk. It is up to you to - know how many of the first bytes of data are the actual - response. + Note that the first 16 bytes of the received MBOX[] will be + printed, even if some of the data is junk, and skipping bytes + 17 to 32. It is up to you to know how many of the first bytes of + data are the actual response. -- cgit From e15d92bee8184048e7419193263e62a0830cf365 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 20 May 2019 14:21:16 +0800 Subject: doc: ext2: update description of quota options for ext2 ext2 support user/group disk quota by specifying usrquota/grpquota option on mount, so fix the description in the doc properly. Signed-off-by: Chengguang Xu Signed-off-by: Jan Kara --- Documentation/filesystems/ext2.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt index a19973a4dd1e..94c2cf0292f5 100644 --- a/Documentation/filesystems/ext2.txt +++ b/Documentation/filesystems/ext2.txt @@ -57,7 +57,13 @@ noacl Don't support POSIX ACLs. nobh Do not attach buffer_heads to file pagecache. -grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. +quota, usrquota Enable user disk quota support + (requires CONFIG_QUOTA). + +grpquota Enable group disk quota support + (requires CONFIG_QUOTA). + +noquota option ls silently ignored by ext2. Specification -- cgit From 70ac79f5d1ef95e61b174e95192b3555a4caef2f Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Fri, 17 May 2019 10:26:29 +0100 Subject: dt-bindings: arm: renesas: Add HopeRun RZ/G2[M] boards This patch adds board HiHope RZ/G2M (the main board, powered by the R8A774A1) and board HiHope RZ/G2 EX (the expansion board that sits on top of the HiHope RZ/G2M). Both boards are made by Jiangsu HopeRun Software Co., Ltd. (a.k.a. HopeRun). Useful links: http://hihope.org/product/detail/rzg2 https://item.taobao.com/item.htm?spm=a2oq0.12575281.0.0.6bcf1debQpzkRS&ft=t&id=592177498472 http://www.hoperun.com/Cn/news/id/379 We already know that the HiHope RZ/G2 EX will also sit on the HiHope RZ/G2N, even though the HiHope RZ/G2N doesn't exist just yet. Signed-off-by: Fabrizio Castro Reviewed-by: Chris Paterson Reviewed-by: Rob Herring Signed-off-by: Simon Horman --- Documentation/devicetree/bindings/arm/renesas.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml b/Documentation/devicetree/bindings/arm/renesas.yaml index 19f379863d50..08c923f8c257 100644 --- a/Documentation/devicetree/bindings/arm/renesas.yaml +++ b/Documentation/devicetree/bindings/arm/renesas.yaml @@ -106,6 +106,14 @@ properties: - description: RZ/G2M (R8A774A1) items: + - enum: + - hoperun,hihope-rzg2m # HopeRun HiHope RZ/G2M platform + - const: renesas,r8a774a1 + + - items: + - enum: + - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms + - const: hoperun,hihope-rzg2m - const: renesas,r8a774a1 - description: RZ/G2E (R8A774C0) -- cgit From 90fdbe8ab27047f7b9da57d1975c14a8d8f39370 Mon Sep 17 00:00:00 2001 From: "S.j. Wang" Date: Thu, 16 May 2019 11:40:56 +0000 Subject: ASoC: cs42xx8: add reset-gpios in binding document Add reset-gpios property, which is optional. Signed-off-by: Shengjiu Wang Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/cs42xx8.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/cs42xx8.txt b/Documentation/devicetree/bindings/sound/cs42xx8.txt index 8619a156d038..bbfe39347c20 100644 --- a/Documentation/devicetree/bindings/sound/cs42xx8.txt +++ b/Documentation/devicetree/bindings/sound/cs42xx8.txt @@ -14,6 +14,11 @@ Required properties: - VA-supply, VD-supply, VLS-supply, VLC-supply: power supplies for the device, as covered in Documentation/devicetree/bindings/regulator/regulator.txt +Optional properties: + + - reset-gpios : a GPIO spec to define which pin is connected to the chip's + !RESET pin + Example: cs42888: codec@48 { @@ -25,4 +30,5 @@ cs42888: codec@48 { VD-supply = <®_audio>; VLS-supply = <®_audio>; VLC-supply = <®_audio>; + reset-gpios = <&pca9557_b 1 GPIO_ACTIVE_LOW>; }; -- cgit From f22558d701755f2f9cfdbd4a0862a58ef5fb1dbb Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:32 +0200 Subject: dt-bindings: olpc,xo1.75-ec: Add OLPC XO-1.75 EC bindings The OLPC XO-1.75 Embedded Controller is a SPI master that uses extra signals for handshaking. It needs to know when is the slave (Linux) side's TX FIFO ready for transfer (the ready-gpio signal on the SPI controller node) and when does it wish to respond with a command (the cmd-gpio property). Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Reviewed-by: Rob Herring Signed-off-by: Andy Shevchenko --- .../devicetree/bindings/misc/olpc,xo1.75-ec.txt | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt new file mode 100644 index 000000000000..8c4d649cdd8f --- /dev/null +++ b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt @@ -0,0 +1,23 @@ +OLPC XO-1.75 Embedded Controller + +Required properties: +- compatible: Should be "olpc,xo1.75-ec". +- cmd-gpios: gpio specifier of the CMD pin + +The embedded controller requires the SPI controller driver to signal readiness +to receive a transfer (that is, when TX FIFO contains the response data) by +strobing the ACK pin with the ready signal. See the "ready-gpios" property of the +SSP binding as documented in: +. + +Example: + &ssp3 { + spi-slave; + ready-gpios = <&gpio 125 GPIO_ACTIVE_HIGH>; + + slave { + compatible = "olpc,xo1.75-ec"; + spi-cpha; + cmd-gpios = <&gpio 155 GPIO_ACTIVE_HIGH>; + }; + }; -- cgit From 2f230f300497f695bec65439b20df3293eabf889 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 6 May 2019 14:16:09 -0500 Subject: dt-bindings: mfd: LMU: Add the ramp up/down property Document the ramp-up and ramp-down property in the binding. Removing the "sec" from the property definition as seconds is implied. Signed-off-by: Dan Murphy Reviewed-by: Rob Herring Acked-by: Lee Jones Signed-off-by: Jacek Anaszewski --- Documentation/devicetree/bindings/mfd/ti-lmu.txt | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt index 86ca786d54fc..160eb0a5e21a 100644 --- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt +++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt @@ -23,8 +23,14 @@ Required properties: 0x36 for LM3633, LM3697 0x63 for LM3695 -Optional property: +Optional properties: - enable-gpios: A GPIO specifier for hardware enable pin. + - ramp-up-us: Current ramping from one brightness level to + the a higher brightness level. + Range from 2048 us - 117.44 s + - ramp-down-us: Current ramping from one brightness level to + the a lower brightness level. + Range from 2048 us - 117.44 s Required node: - backlight: All LMU devices have backlight child nodes. @@ -90,7 +96,7 @@ lm3631@29 { lcd_bl { led-sources = <0 1>; - ramp-up-msec = <300>; + ramp-up-us = <300000>; }; }; }; @@ -152,15 +158,15 @@ lm3633@36 { main { label = "main_lcd"; led-sources = <1 2>; - ramp-up-msec = <500>; - ramp-down-msec = <500>; + ramp-up-us = <500000>; + ramp-down-us = <500000>; }; front { label = "front_lcd"; led-sources = <0>; - ramp-up-msec = <1000>; - ramp-down-msec = <0>; + ramp-up-us = <1000000>; + ramp-down-us = <0>; }; }; @@ -212,8 +218,8 @@ lm3697@36 { lcd { led-sources = <0 1 2>; - ramp-up-msec = <200>; - ramp-down-msec = <200>; + ramp-up-us = <200000>; + ramp-down-us = <200000>; }; }; -- cgit From d0147554d004e7cc427f6cfecf871916e34e8b67 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 6 May 2019 14:16:10 -0500 Subject: dt-bindings: mfd: LMU: Add ti,brightness-resolution Add ti,brightness-resolution to the TI LMU binding to define whether the device uses 8-bit brightness or 11-bit brightness. Signed-off-by: Dan Murphy Reviewed-by: Rob Herring Signed-off-by: Jacek Anaszewski --- Documentation/devicetree/bindings/mfd/ti-lmu.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt index 160eb0a5e21a..e3efefb194c5 100644 --- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt +++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt @@ -31,6 +31,16 @@ Optional properties: - ramp-down-us: Current ramping from one brightness level to the a lower brightness level. Range from 2048 us - 117.44 s + - ti,brightness-resolution - This determines whether to use 8 bit brightness + mode or 11 bit brightness mode. If this value is + not set the device is defaulted to the preferred + 8bit brightness mode per 7.3.4.1 of the data + sheet. This setting can either be in the parent + node or as part of the LED child nodes. This + is determined by the part itself if the strings + have a common brightness register or individual + brightness registers. + The values are 255 (8bit) or 2047 (11bit). Required node: - backlight: All LMU devices have backlight child nodes. @@ -217,6 +227,7 @@ lm3697@36 { compatible = "ti,lm3697-backlight"; lcd { + ti,brightness-resolution = <255>; led-sources = <0 1 2>; ramp-up-us = <200000>; ramp-down-us = <200000>; -- cgit From bbc9be3ac8ab022061e7cfcc050ae02240144ba1 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 26 Mar 2019 19:23:31 +0000 Subject: Documentation: DT: Add entry for DPAA2 console This patch adds a devicetree binding documentation for FSL's DPAA2 console. Signed-off-by: Ioana Ciornei Reviewed-by: Rob Herring Signed-off-by: Li Yang --- Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt new file mode 100644 index 000000000000..1442ba5d2d98 --- /dev/null +++ b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt @@ -0,0 +1,11 @@ +DPAA2 console support + +Required properties: + + - compatible + Value type: + Definition: Must be "fsl,dpaa2-console". + - reg + Value type: + Definition: A standard property. Specifies the region where the MCFBA + (MC firmware base address) register can be found. -- cgit From 804898e8bc43080c9194ae7e1807bf0e995bad73 Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Wed, 15 May 2019 10:20:39 -0500 Subject: dt-bindings: rcar-gen3-phy-usb2: Document dr_mode Document the optional dr_mode property Signed-off-by: Chris Brandt Reviewed-by: Rob Herring Reviewed-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt index d46188f450bf..64afd086e606 100644 --- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt +++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt @@ -46,6 +46,9 @@ channel as USB OTG: regulator will be managed during the PHY power on/off sequence. - renesas,no-otg-pins: boolean, specify when a board does not provide proper otg pins. +- dr_mode: string, indicates the working mode for the PHY. Can be "host", + "peripheral", or "otg". Should be set if otg controller is not used. + Example (R-Car H3): -- cgit From b051c93746543385f398b6b8002a51e9d43143a6 Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Wed, 15 May 2019 10:20:40 -0500 Subject: dt-bindings: rcar-gen3-phy-usb2: Add r7s9210 support Document RZ/A2 (R7S9210) SoC bindings. Signed-off-by: Chris Brandt Reviewed-by: Rob Herring Reviewed-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt index 64afd086e606..503a8cfb3184 100644 --- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt +++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt @@ -1,10 +1,12 @@ * Renesas R-Car generation 3 USB 2.0 PHY This file provides information on what the device node for the R-Car generation -3, RZ/G1C and RZ/G2 USB 2.0 PHY contain. +3, RZ/G1C, RZ/G2 and RZ/A2 USB 2.0 PHY contain. Required properties: -- compatible: "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470 +- compatible: "renesas,usb2-phy-r7s9210" if the device is a part of an R7S9210 + SoC. + "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470 SoC. "renesas,usb2-phy-r8a774a1" if the device is a part of an R8A774A1 SoC. @@ -20,8 +22,8 @@ Required properties: R8A77990 SoC. "renesas,usb2-phy-r8a77995" if the device is a part of an R8A77995 SoC. - "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3 or RZ/G2 - compatible device. + "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3, RZ/G2 or + RZ/A2 compatible device. When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first -- cgit From 6e9aed4ed4ca129510fcb1af495391d4717246d6 Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Wed, 15 May 2019 10:20:45 -0500 Subject: dt-bindings: usb: renesas_usbhs: Add support for r7s9210 Add support for r7s9210 (RZ/A2M) SoC Signed-off-by: Chris Brandt Reviewed-by: Rob Herring Reviewed-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/renesas_usbhs.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt index b8acc2a994a8..e39255ea6e4f 100644 --- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt +++ b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt @@ -20,9 +20,11 @@ Required properties: - "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device - "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device - "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device + - "renesas,usbhs-r7s9210" for r7s9210 (RZ/A2) compatible device - "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices - "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices - "renesas,rza1-usbhs" for RZ/A1 compatible device + - "renesas,rza2-usbhs" for RZ/A2 compatible device When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first followed -- cgit From 221fb7268d67c0867a93f23586bd53c3c3969eee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 20 May 2019 14:22:25 -0700 Subject: Documentation/networking: fix af_xdp.rst Sphinx warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix Sphinx warnings in Documentation/networking/af_xdp.rst by adding indentation: Documentation/networking/af_xdp.rst:319: WARNING: Literal block expected; none found. Documentation/networking/af_xdp.rst:326: WARNING: Literal block expected; none found. Fixes: 0f4a9b7d4ecb ("xsk: add FAQ to facilitate for first time users") Signed-off-by: Randy Dunlap Cc: Magnus Karlsson Cc: Daniel Borkmann Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- Documentation/networking/af_xdp.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index e14d7d40fc75..50bccbf68308 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -316,16 +316,16 @@ A: When a netdev of a physical NIC is initialized, Linux usually all the traffic, you can force the netdev to only have 1 queue, queue id 0, and then bind to queue 0. You can use ethtool to do this:: - sudo ethtool -L combined 1 + sudo ethtool -L combined 1 If you want to only see part of the traffic, you can program the NIC through ethtool to filter out your traffic to a single queue id that you can bind your XDP socket to. Here is one example in which UDP traffic to and from port 4242 are sent to queue 2:: - sudo ethtool -N rx-flow-hash udp4 fn - sudo ethtool -N flow-type udp4 src-port 4242 dst-port \ - 4242 action 2 + sudo ethtool -N rx-flow-hash udp4 fn + sudo ethtool -N flow-type udp4 src-port 4242 dst-port \ + 4242 action 2 A number of other ways are possible all up to the capabilitites of the NIC you have. -- cgit From 0f202f69a16b9a687911202083c37636ad73c77e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 17 May 2019 10:27:22 -0500 Subject: dt-bindings: arm: amlogic: Move 'amlogic, meson-gx-ao-secure' binding to its own file It is best practice to have 1 binding per file, so board level bindings should be separate for various misc SoC bindings. Cc: Mark Rutland Cc: Carlo Caione Cc: Kevin Hilman Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-amlogic@lists.infradead.org Signed-off-by: Rob Herring Acked-by: Neil Armstrong Acked-by: Kevin Hilman Signed-off-by: Kevin Hilman --- Documentation/devicetree/bindings/arm/amlogic.txt | 29 ---------------------- .../arm/amlogic/amlogic,meson-gx-ao-secure.txt | 28 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 29 deletions(-) create mode 100644 Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt index 061f7b98a07f..5f650248b18e 100644 --- a/Documentation/devicetree/bindings/arm/amlogic.txt +++ b/Documentation/devicetree/bindings/arm/amlogic.txt @@ -111,32 +111,3 @@ Board compatible values (alphabetically, grouped by SoC): - "amlogic,u200" (Meson g12a s905d2) - "amediatech,x96-max" (Meson g12a s905x2) - "seirobotics,sei510" (Meson g12a s905x2) - -Amlogic Meson Firmware registers Interface ------------------------------------------- - -The Meson SoCs have a register bank with status and data shared with the -secure firmware. - -Required properties: - - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon" - -Properties should indentify components of this register interface : - -Meson GX SoC Information ------------------------- -A firmware register encodes the SoC type, package and revision information on -the Meson GX SoCs. -If present, the following property should be added : - -Optional properties: - - amlogic,has-chip-id: If present, the interface gives the current SoC version. - -Example -------- - -ao-secure@140 { - compatible = "amlogic,meson-gx-ao-secure", "syscon"; - reg = <0x0 0x140 0x0 0x140>; - amlogic,has-chip-id; -}; diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt new file mode 100644 index 000000000000..c67d9f48fb91 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt @@ -0,0 +1,28 @@ +Amlogic Meson Firmware registers Interface +------------------------------------------ + +The Meson SoCs have a register bank with status and data shared with the +secure firmware. + +Required properties: + - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon" + +Properties should indentify components of this register interface : + +Meson GX SoC Information +------------------------ +A firmware register encodes the SoC type, package and revision information on +the Meson GX SoCs. +If present, the following property should be added : + +Optional properties: + - amlogic,has-chip-id: If present, the interface gives the current SoC version. + +Example +------- + +ao-secure@140 { + compatible = "amlogic,meson-gx-ao-secure", "syscon"; + reg = <0x0 0x140 0x0 0x140>; + amlogic,has-chip-id; +}; -- cgit From c0c752d8c6b3dd78b3e7951453bc43be880dcd08 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 17 May 2019 10:27:23 -0500 Subject: dt-bindings: arm: Convert Amlogic board/soc bindings to json-schema Convert Amlogic SoC bindings to DT schema format using json-schema. Cc: Carlo Caione Cc: Kevin Hilman Cc: Mark Rutland Cc: devicetree@vger.kernel.org Signed-off-by: Rob Herring Reviewed-by: Neil Armstrong [khilman: updated maninainers] Signed-off-by: Kevin Hilman --- Documentation/devicetree/bindings/arm/amlogic.txt | 113 ----------------- Documentation/devicetree/bindings/arm/amlogic.yaml | 138 +++++++++++++++++++++ 2 files changed, 138 insertions(+), 113 deletions(-) delete mode 100644 Documentation/devicetree/bindings/arm/amlogic.txt create mode 100644 Documentation/devicetree/bindings/arm/amlogic.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt deleted file mode 100644 index 5f650248b18e..000000000000 --- a/Documentation/devicetree/bindings/arm/amlogic.txt +++ /dev/null @@ -1,113 +0,0 @@ -Amlogic MesonX device tree bindings -------------------------------------------- - -Work in progress statement: - -Device tree files and bindings applying to Amlogic SoCs and boards are -considered "unstable". Any Amlogic device tree binding may change at -any time. Be sure to use a device tree binary and a kernel image -generated from the same source tree. - -Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a -stable binding/ABI. - ---------------------------------------------------------------- - -Boards with the Amlogic Meson6 SoC shall have the following properties: - Required root node property: - compatible: "amlogic,meson6" - -Boards with the Amlogic Meson8 SoC shall have the following properties: - Required root node property: - compatible: "amlogic,meson8"; - -Boards with the Amlogic Meson8b SoC shall have the following properties: - Required root node property: - compatible: "amlogic,meson8b"; - -Boards with the Amlogic Meson8m2 SoC shall have the following properties: - Required root node property: - compatible: "amlogic,meson8m2"; - -Boards with the Amlogic Meson GXBaby SoC shall have the following properties: - Required root node property: - compatible: "amlogic,meson-gxbb"; - -Boards with the Amlogic Meson GXL S905X SoC shall have the following properties: - Required root node property: - compatible: "amlogic,s905x", "amlogic,meson-gxl"; - -Boards with the Amlogic Meson GXL S905D SoC shall have the following properties: - Required root node property: - compatible: "amlogic,s905d", "amlogic,meson-gxl"; - -Boards with the Amlogic Meson GXL S805X SoC shall have the following properties: - Required root node property: - compatible: "amlogic,s805x", "amlogic,meson-gxl"; - -Boards with the Amlogic Meson GXL S905W SoC shall have the following properties: - Required root node property: - compatible: "amlogic,s905w", "amlogic,meson-gxl"; - -Boards with the Amlogic Meson GXM S912 SoC shall have the following properties: - Required root node property: - compatible: "amlogic,s912", "amlogic,meson-gxm"; - -Boards with the Amlogic Meson AXG A113D SoC shall have the following properties: - Required root node property: - compatible: "amlogic,a113d", "amlogic,meson-axg"; - -Boards with the Amlogic Meson G12A S905D2 SoC shall have the following properties: - Required root node property: - compatible: "amlogic,g12a"; - -Board compatible values (alphabetically, grouped by SoC): - - - "geniatech,atv1200" (Meson6) - - - "minix,neo-x8" (Meson8) - - - "endless,ec100" (Meson8b) - - "hardkernel,odroid-c1" (Meson8b) - - "tronfy,mxq" (Meson8b) - - - "tronsmart,mxiii-plus" (Meson8m2) - - - "amlogic,p200" (Meson gxbb) - - "amlogic,p201" (Meson gxbb) - - "friendlyarm,nanopi-k2" (Meson gxbb) - - "hardkernel,odroid-c2" (Meson gxbb) - - "nexbox,a95x" (Meson gxbb or Meson gxl s905x) - - "tronsmart,vega-s95-pro", "tronsmart,vega-s95" (Meson gxbb) - - "tronsmart,vega-s95-meta", "tronsmart,vega-s95" (Meson gxbb) - - "tronsmart,vega-s95-telos", "tronsmart,vega-s95" (Meson gxbb) - - "wetek,hub" (Meson gxbb) - - "wetek,play2" (Meson gxbb) - - - "amlogic,p212" (Meson gxl s905x) - - "hwacom,amazetv" (Meson gxl s905x) - - "khadas,vim" (Meson gxl s905x) - - "libretech,cc" (Meson gxl s905x) - - - "amlogic,p230" (Meson gxl s905d) - - "amlogic,p231" (Meson gxl s905d) - - "phicomm,n1" (Meson gxl s905d) - - - "amlogic,p241" (Meson gxl s805x) - - "libretech,aml-s805x-ac" (Meson gxl s805x) - - - "amlogic,p281" (Meson gxl s905w) - - "oranth,tx3-mini" (Meson gxl s905w) - - - "amlogic,q200" (Meson gxm s912) - - "amlogic,q201" (Meson gxm s912) - - "khadas,vim2" (Meson gxm s912) - - "kingnovel,r-box-pro" (Meson gxm S912) - - "nexbox,a1" (Meson gxm s912) - - "tronsmart,vega-s96" (Meson gxm s912) - - - "amlogic,s400" (Meson axg a113d) - - - "amlogic,u200" (Meson g12a s905d2) - - "amediatech,x96-max" (Meson g12a s905x2) - - "seirobotics,sei510" (Meson g12a s905x2) diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml new file mode 100644 index 000000000000..a0dd12be4de4 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/amlogic.yaml @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/arm/amlogic.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Amlogic MesonX device tree bindings + +maintainers: + - Kevin Hilman + +description: |+ + Work in progress statement: + + Device tree files and bindings applying to Amlogic SoCs and boards are + considered "unstable". Any Amlogic device tree binding may change at + any time. Be sure to use a device tree binary and a kernel image + generated from the same source tree. + + Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a + stable binding/ABI. + +properties: + $nodename: + const: '/' + compatible: + oneOf: + - description: Boards with the Amlogic Meson6 SoC + items: + - enum: + - geniatech,atv1200 + - const: amlogic,meson6 + + - description: Boards with the Amlogic Meson8 SoC + items: + - enum: + - minix,neo-x8 + - const: amlogic,meson8 + + - description: Boards with the Amlogic Meson8m2 SoC + items: + - enum: + - tronsmart,mxiii-plus + - const: amlogic,meson8m2 + + - description: Boards with the Amlogic Meson8b SoC + items: + - enum: + - endless,ec100 + - hardkernel,odroid-c1 + - tronfy,mxq + - const: amlogic,meson8b + + - description: Boards with the Amlogic Meson GXBaby SoC + items: + - enum: + - amlogic,p200 + - amlogic,p201 + - friendlyarm,nanopi-k2 + - hardkernel,odroid-c2 + - nexbox,a95x + - wetek,hub + - wetek,play2 + - const: amlogic,meson-gxbb + + - description: Tronsmart Vega S95 devices + items: + - enum: + - tronsmart,vega-s95-pro + - tronsmart,vega-s95-meta + - tronsmart,vega-s95-telos + - const: tronsmart,vega-s95 + - const: amlogic,meson-gxbb + + - description: Boards with the Amlogic Meson GXL S805X SoC + items: + - enum: + - amlogic,p241 + - libretech,aml-s805x-ac + - const: amlogic,s805x + - const: amlogic,meson-gxl + + - description: Boards with the Amlogic Meson GXL S905W SoC + items: + - enum: + - amlogic,p281 + - oranth,tx3-mini + - const: amlogic,s905w + - const: amlogic,meson-gxl + + - description: Boards with the Amlogic Meson GXL S905X SoC + items: + - enum: + - amediatech,x96-max + - amlogic,p212 + - hwacom,amazetv + - khadas,vim + - libretech,cc + - nexbox,a95x + - seirobotics,sei510 + - const: amlogic,s905x + - const: amlogic,meson-gxl + + - description: Boards with the Amlogic Meson GXL S905D SoC + items: + - enum: + - amlogic,p230 + - amlogic,p231 + - phicomm,n1 + - const: amlogic,s905d + - const: amlogic,meson-gxl + + - description: Boards with the Amlogic Meson GXM S912 SoC + items: + - enum: + - amlogic,q200 + - amlogic,q201 + - khadas,vim2 + - kingnovel,r-box-pro + - nexbox,a1 + - tronsmart,vega-s96 + - const: amlogic,s912 + - const: amlogic,meson-gxm + + - description: Boards with the Amlogic Meson AXG A113D SoC + items: + - enum: + - amlogic,s400 + - const: amlogic,a113d + - const: amlogic,meson-axg + + - description: Boards with the Amlogic Meson G12A S905D2 SoC + items: + - enum: + - amlogic,u200 + - const: amlogic,g12a + +... -- cgit From 46f4050a6587ea3e59e6f47f06513c00689b9d40 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 21 May 2019 11:04:37 +0100 Subject: regulator: arizona: Update device tree binding to support Madera CODECs The Arizona regulator drivers are now also used by the Cirrus Logic Madera audio CODECs, update the binding document to link to the primary binding document for these as well as the existing Arizona document. Signed-off-by: Charles Keepax Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/arizona-regulator.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt index 443564d7784f..69bf41949b01 100644 --- a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt @@ -5,7 +5,8 @@ of analogue I/O. This document lists regulator specific bindings, see the primary binding document: - ../mfd/arizona.txt + For Wolfson Microelectronic Arizona codecs: ../mfd/arizona.txt + For Cirrus Logic Madera codecs: ../mfd/madera.txt Optional properties: - wlf,ldoena : GPIO specifier for the GPIO controlling LDOENA -- cgit From 0a1b929356830257568f9547e173f0e7498060ea Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 20 May 2019 16:50:33 +0200 Subject: spi: Add YAML schemas for the generic SPI options The SPI controllers have a bunch of generic options that are needed in a device tree. Add a YAML schemas for those. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-bus.txt | 112 +------------- .../devicetree/bindings/spi/spi-controller.yaml | 161 +++++++++++++++++++++ 2 files changed, 162 insertions(+), 111 deletions(-) create mode 100644 Documentation/devicetree/bindings/spi/spi-controller.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/spi-bus.txt b/Documentation/devicetree/bindings/spi/spi-bus.txt index 1f6e86f787ef..e07783505498 100644 --- a/Documentation/devicetree/bindings/spi/spi-bus.txt +++ b/Documentation/devicetree/bindings/spi/spi-bus.txt @@ -1,111 +1 @@ -SPI (Serial Peripheral Interface) busses - -SPI busses can be described with a node for the SPI controller device -and a set of child nodes for each SPI slave on the bus. The system's SPI -controller may be described for use in SPI master mode or in SPI slave mode, -but not for both at the same time. - -The SPI controller node requires the following properties: -- compatible - Name of SPI bus controller following generic names - recommended practice. - -In master mode, the SPI controller node requires the following additional -properties: -- #address-cells - number of cells required to define a chip select - address on the SPI bus. -- #size-cells - should be zero. - -In slave mode, the SPI controller node requires one additional property: -- spi-slave - Empty property. - -No other properties are required in the SPI bus node. It is assumed -that a driver for an SPI bus device will understand that it is an SPI bus. -However, the binding does not attempt to define the specific method for -assigning chip select numbers. Since SPI chip select configuration is -flexible and non-standardized, it is left out of this binding with the -assumption that board specific platform code will be used to manage -chip selects. Individual drivers can define additional properties to -support describing the chip select layout. - -Optional properties (master mode only): -- cs-gpios - gpios chip select. -- num-cs - total number of chipselects. - -If cs-gpios is used the number of chip selects will be increased automatically -with max(cs-gpios > hw cs). - -So if for example the controller has 2 CS lines, and the cs-gpios -property looks like this: - -cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>; - -Then it should be configured so that num_chipselect = 4 with the -following mapping: - -cs0 : &gpio1 0 0 -cs1 : native -cs2 : &gpio1 1 0 -cs3 : &gpio1 2 0 - - -SPI slave nodes must be children of the SPI controller node. - -In master mode, one or more slave nodes (up to the number of chip selects) can -be present. Required properties are: -- compatible - Name of SPI device following generic names recommended - practice. -- reg - Chip select address of device. -- spi-max-frequency - Maximum SPI clocking speed of device in Hz. - -In slave mode, the (single) slave node is optional. -If present, it must be called "slave". Required properties are: -- compatible - Name of SPI device following generic names recommended - practice. - -All slave nodes can contain the following optional properties: -- spi-cpol - Empty property indicating device requires inverse clock - polarity (CPOL) mode. -- spi-cpha - Empty property indicating device requires shifted clock - phase (CPHA) mode. -- spi-cs-high - Empty property indicating device requires chip select - active high. -- spi-3wire - Empty property indicating device requires 3-wire mode. -- spi-lsb-first - Empty property indicating device requires LSB first mode. -- spi-tx-bus-width - The bus width (number of data wires) that is used for MOSI. - Defaults to 1 if not present. -- spi-rx-bus-width - The bus width (number of data wires) that is used for MISO. - Defaults to 1 if not present. -- spi-rx-delay-us - Microsecond delay after a read transfer. -- spi-tx-delay-us - Microsecond delay after a write transfer. - -Some SPI controllers and devices support Dual and Quad SPI transfer mode. -It allows data in the SPI system to be transferred using 2 wires (DUAL) or 4 -wires (QUAD). -Now the value that spi-tx-bus-width and spi-rx-bus-width can receive is -only 1 (SINGLE), 2 (DUAL) and 4 (QUAD). -Dual/Quad mode is not allowed when 3-wire mode is used. - -If a gpio chipselect is used for the SPI slave the gpio number will be passed -via the SPI master node cs-gpios property. - -SPI example for an MPC5200 SPI bus: - spi@f00 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi"; - reg = <0xf00 0x20>; - interrupts = <2 13 0 2 14 0>; - interrupt-parent = <&mpc5200_pic>; - - ethernet-switch@0 { - compatible = "micrel,ks8995m"; - spi-max-frequency = <1000000>; - reg = <0>; - }; - - codec@1 { - compatible = "ti,tlv320aic26"; - spi-max-frequency = <100000>; - reg = <1>; - }; - }; +This file has moved to spi-controller.yaml. diff --git a/Documentation/devicetree/bindings/spi/spi-controller.yaml b/Documentation/devicetree/bindings/spi/spi-controller.yaml new file mode 100644 index 000000000000..876c0623f322 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-controller.yaml @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/spi-controller.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SPI Controller Generic Binding + +maintainers: + - Mark Brown + +description: | + SPI busses can be described with a node for the SPI controller device + and a set of child nodes for each SPI slave on the bus. The system SPI + controller may be described for use in SPI master mode or in SPI slave mode, + but not for both at the same time. + +properties: + $nodename: + pattern: "^spi(@.*|-[0-9a-f])*$" + + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + cs-gpios: + description: | + GPIOs used as chip selects. + If that property is used, the number of chip selects will be + increased automatically with max(cs-gpios, hardware chip selects). + + So if, for example, the controller has 2 CS lines, and the + cs-gpios looks like this + cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>; + + Then it should be configured so that num_chipselect = 4, with + the following mapping + cs0 : &gpio1 0 0 + cs1 : native + cs2 : &gpio1 1 0 + cs3 : &gpio1 2 0 + + num-cs: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Total number of chip selects. + + spi-slave: + $ref: /schemas/types.yaml#/definitions/flag + description: + The SPI controller acts as a slave, instead of a master. + +patternProperties: + "^slave$": + type: object + + properties: + compatible: + description: + Compatible of the SPI device. + + required: + - compatible + + "^.*@[0-9a-f]+$": + type: object + + properties: + compatible: + description: + Compatible of the SPI device. + + reg: + maxItems: 1 + minimum: 0 + maximum: 256 + description: + Chip select used by the device. + + spi-3wire: + $ref: /schemas/types.yaml#/definitions/flag + description: + The device requires 3-wire mode. + + spi-cpha: + $ref: /schemas/types.yaml#/definitions/flag + description: + The device requires shifted clock phase (CPHA) mode. + + spi-cpol: + $ref: /schemas/types.yaml#/definitions/flag + description: + The device requires inverse clock polarity (CPOL) mode. + + spi-cs-high: + $ref: /schemas/types.yaml#/definitions/flag + description: + The device requires the chip select active high. + + spi-lsb-first: + $ref: /schemas/types.yaml#/definitions/flag + description: + The device requires the LSB first mode. + + spi-max-frequency: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Maximum SPI clocking speed of the device in Hz. + + spi-rx-bus-width: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - enum: [ 1, 2, 4 ] + - default: 1 + description: + Bus width to the SPI bus used for MISO. + + spi-rx-delay-us: + description: + Delay, in microseconds, after a read transfer. + + spi-tx-bus-width: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - enum: [ 1, 2, 4 ] + - default: 1 + description: + Bus width to the SPI bus used for MOSI. + + spi-tx-delay-us: + description: + Delay, in microseconds, after a write transfer. + + required: + - compatible + - reg + +examples: + - | + spi@f00 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi"; + reg = <0xf00 0x20>; + interrupts = <2 13 0 2 14 0>; + interrupt-parent = <&mpc5200_pic>; + + ethernet-switch@0 { + compatible = "micrel,ks8995m"; + spi-max-frequency = <1000000>; + reg = <0>; + }; + + codec@1 { + compatible = "ti,tlv320aic26"; + spi-max-frequency = <100000>; + reg = <1>; + }; + }; -- cgit From 3133f5c24305c31ef2805a7eb05fc18a664836d3 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 20 May 2019 16:50:34 +0200 Subject: spi: sun4i: Add YAML schemas Switch the DT binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/spi/allwinner,sun4i-a10-spi.yaml | 86 ++++++++++++++++++++++ .../devicetree/bindings/spi/spi-sun4i.txt | 23 ------ 2 files changed, 86 insertions(+), 23 deletions(-) create mode 100644 Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml delete mode 100644 Documentation/devicetree/bindings/spi/spi-sun4i.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml new file mode 100644 index 000000000000..c374fd4923a6 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/allwinner,sun4i-a10-spi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 SPI Controller Device Tree Bindings + +allOf: + - $ref: "spi-controller.yaml" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#address-cells": true + "#size-cells": true + + compatible: + const: allwinner,sun4i-a10-spi + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: Bus Clock + - description: Module Clock + + clock-names: + items: + - const: ahb + - const: mod + + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + + num-cs: true + +patternProperties: + "^.*@[0-9a-f]+": + properties: + reg: + items: + minimum: 0 + maximum: 4 + + spi-rx-bus-width: + const: 1 + + spi-tx-bus-width: + const: 1 + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + spi1: spi@1c06000 { + compatible = "allwinner,sun4i-a10-spi"; + reg = <0x01c06000 0x1000>; + interrupts = <11>; + clocks = <&ahb_gates 21>, <&spi1_clk>; + clock-names = "ahb", "mod"; + #address-cells = <1>; + #size-cells = <0>; + }; + +... diff --git a/Documentation/devicetree/bindings/spi/spi-sun4i.txt b/Documentation/devicetree/bindings/spi/spi-sun4i.txt deleted file mode 100644 index c75d604a8290..000000000000 --- a/Documentation/devicetree/bindings/spi/spi-sun4i.txt +++ /dev/null @@ -1,23 +0,0 @@ -Allwinner A10 SPI controller - -Required properties: -- compatible: Should be "allwinner,sun4-a10-spi". -- reg: Should contain register location and length. -- interrupts: Should contain interrupt. -- clocks: phandle to the clocks feeding the SPI controller. Two are - needed: - - "ahb": the gated AHB parent clock - - "mod": the parent module clock -- clock-names: Must contain the clock names described just above - -Example: - -spi1: spi@1c06000 { - compatible = "allwinner,sun4i-a10-spi"; - reg = <0x01c06000 0x1000>; - interrupts = <11>; - clocks = <&ahb_gates 21>, <&spi1_clk>; - clock-names = "ahb", "mod"; - #address-cells = <1>; - #size-cells = <0>; -}; -- cgit From 101e6fce89b4707429185527e15d97c7e8f62ec5 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 20 May 2019 16:50:35 +0200 Subject: spi: sun6i: Add YAML schemas Switch the DT binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/spi/allwinner,sun6i-a31-spi.yaml | 106 +++++++++++++++++++++ .../devicetree/bindings/spi/spi-sun6i.txt | 44 --------- 2 files changed, 106 insertions(+), 44 deletions(-) create mode 100644 Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml delete mode 100644 Documentation/devicetree/bindings/spi/spi-sun6i.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml new file mode 100644 index 000000000000..bda7a5befd8b --- /dev/null +++ b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/allwinner,sun6i-a31-spi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A31 SPI Controller Device Tree Bindings + +allOf: + - $ref: "spi-controller.yaml" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#address-cells": true + "#size-cells": true + + compatible: + enum: + - allwinner,sun6i-a31-spi + - allwinner,sun8i-h3-spi + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: Bus Clock + - description: Module Clock + + clock-names: + items: + - const: ahb + - const: mod + + resets: + maxItems: 1 + + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + + num-cs: true + +patternProperties: + "^.*@[0-9a-f]+": + properties: + reg: + items: + minimum: 0 + maximum: 4 + + spi-rx-bus-width: + const: 1 + + spi-tx-bus-width: + const: 1 + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + spi1: spi@1c69000 { + compatible = "allwinner,sun6i-a31-spi"; + reg = <0x01c69000 0x1000>; + interrupts = <0 66 4>; + clocks = <&ahb1_gates 21>, <&spi1_clk>; + clock-names = "ahb", "mod"; + resets = <&ahb1_rst 21>; + #address-cells = <1>; + #size-cells = <0>; + }; + + - | + spi0: spi@1c68000 { + compatible = "allwinner,sun8i-h3-spi"; + reg = <0x01c68000 0x1000>; + interrupts = <0 65 4>; + clocks = <&ccu 30>, <&ccu 82>; + clock-names = "ahb", "mod"; + dmas = <&dma 23>, <&dma 23>; + dma-names = "rx", "tx"; + resets = <&ccu 15>; + #address-cells = <1>; + #size-cells = <0>; + }; + +... diff --git a/Documentation/devicetree/bindings/spi/spi-sun6i.txt b/Documentation/devicetree/bindings/spi/spi-sun6i.txt deleted file mode 100644 index 435a8e0731ac..000000000000 --- a/Documentation/devicetree/bindings/spi/spi-sun6i.txt +++ /dev/null @@ -1,44 +0,0 @@ -Allwinner A31/H3 SPI controller - -Required properties: -- compatible: Should be "allwinner,sun6i-a31-spi" or "allwinner,sun8i-h3-spi". -- reg: Should contain register location and length. -- interrupts: Should contain interrupt. -- clocks: phandle to the clocks feeding the SPI controller. Two are - needed: - - "ahb": the gated AHB parent clock - - "mod": the parent module clock -- clock-names: Must contain the clock names described just above -- resets: phandle to the reset controller asserting this device in - reset - -Optional properties: -- dmas: DMA specifiers for rx and tx dma. See the DMA client binding, - Documentation/devicetree/bindings/dma/dma.txt -- dma-names: DMA request names should include "rx" and "tx" if present. - -Example: - -spi1: spi@1c69000 { - compatible = "allwinner,sun6i-a31-spi"; - reg = <0x01c69000 0x1000>; - interrupts = <0 66 4>; - clocks = <&ahb1_gates 21>, <&spi1_clk>; - clock-names = "ahb", "mod"; - resets = <&ahb1_rst 21>; -}; - -spi0: spi@1c68000 { - compatible = "allwinner,sun8i-h3-spi"; - reg = <0x01c68000 0x1000>; - interrupts = ; - clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>; - clock-names = "ahb", "mod"; - dmas = <&dma 23>, <&dma 23>; - dma-names = "rx", "tx"; - pinctrl-names = "default"; - pinctrl-0 = <&spi0_pins>; - resets = <&ccu RST_BUS_SPI0>; - #address-cells = <1>; - #size-cells = <0>; -}; -- cgit From 7ef5f7dd2a07f7460ce900818cc4cd8222daeebe Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 9 May 2019 21:34:14 -0700 Subject: dt-bindings: remoteproc: Rename and amend Hexagon v56 binding The SDM845 Audio DSP peripheral image loader binding describes the properties needed to load and boot firmware on a Hexagon v56. Rename the file and add the Compute DSP (CDSP) found in QCS404 to the binding. Reviewed-by: Rob Herring Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson --- .../bindings/remoteproc/qcom,adsp-pil.txt | 125 ------------------ .../bindings/remoteproc/qcom,hexagon-v56.txt | 140 +++++++++++++++++++++ 2 files changed, 140 insertions(+), 125 deletions(-) delete mode 100644 Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt create mode 100644 Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt deleted file mode 100644 index 66af2c30944f..000000000000 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt +++ /dev/null @@ -1,125 +0,0 @@ -Qualcomm Technology Inc. ADSP Peripheral Image Loader - -This document defines the binding for a component that loads and boots firmware -on the Qualcomm Technology Inc. ADSP Hexagon core. - -- compatible: - Usage: required - Value type: - Definition: must be one of: - "qcom,sdm845-adsp-pil" - -- reg: - Usage: required - Value type: - Definition: must specify the base address and size of the qdsp6ss register - -- interrupts-extended: - Usage: required - Value type: - Definition: must list the watchdog, fatal IRQs ready, handover and - stop-ack IRQs - -- interrupt-names: - Usage: required - Value type: - Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack" - -- clocks: - Usage: required - Value type: - Definition: List of 8 phandle and clock specifier pairs for the adsp. - -- clock-names: - Usage: required - Value type: - Definition: List of clock input name strings sorted in the same - order as the clocks property. Definition must have - "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr", - "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep" - and "qdsp6ss_core". - -- power-domains: - Usage: required - Value type: - Definition: reference to cx power domain node. - -- resets: - Usage: required - Value type: - Definition: reference to the list of 2 reset-controller for the adsp. - -- reset-names: - Usage: required - Value type: - Definition: must be "pdc_sync" and "cc_lpass" - -- qcom,halt-regs: - Usage: required - Value type: - Definition: a phandle reference to a syscon representing TCSR followed - by the offset within syscon for lpass halt register. - -- memory-region: - Usage: required - Value type: - Definition: reference to the reserved-memory for the ADSP - -- qcom,smem-states: - Usage: required - Value type: - Definition: reference to the smem state for requesting the ADSP to - shut down - -- qcom,smem-state-names: - Usage: required - Value type: - Definition: must be "stop" - - -= SUBNODES -The adsp node may have an subnode named "glink-edge" that describes the -communication edge, channels and devices related to the ADSP. -See ../soc/qcom/qcom,glink.txt for details on how to describe these. - -= EXAMPLE -The following example describes the resources needed to boot control the -ADSP, as it is found on SDM845 boards. - - remoteproc@17300000 { - compatible = "qcom,sdm845-adsp-pil"; - reg = <0x17300000 0x40c>; - - interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>, - <&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>, - <&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>, - <&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>, - <&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>; - interrupt-names = "wdog", "fatal", "ready", - "handover", "stop-ack"; - - clocks = <&rpmhcc RPMH_CXO_CLK>, - <&gcc GCC_LPASS_SWAY_CLK>, - <&lpasscc LPASS_Q6SS_AHBS_AON_CLK>, - <&lpasscc LPASS_Q6SS_AHBM_AON_CLK>, - <&lpasscc LPASS_QDSP6SS_XO_CLK>, - <&lpasscc LPASS_QDSP6SS_SLEEP_CLK>, - <&lpasscc LPASS_QDSP6SS_CORE_CLK>; - clock-names = "xo", "sway_cbcr", - "lpass_ahbs_aon_cbcr", - "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", - "qdsp6ss_sleep", "qdsp6ss_core"; - - power-domains = <&rpmhpd SDM845_CX>; - - resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>, - <&aoss_reset AOSS_CC_LPASS_RESTART>; - reset-names = "pdc_sync", "cc_lpass"; - - qcom,halt-regs = <&tcsr_mutex_regs 0x22000>; - - memory-region = <&pil_adsp_mem>; - - qcom,smem-states = <&adsp_smp2p_out 0>; - qcom,smem-state-names = "stop"; - }; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt new file mode 100644 index 000000000000..1337a3d93d35 --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt @@ -0,0 +1,140 @@ +Qualcomm Technology Inc. Hexagon v56 Peripheral Image Loader + +This document defines the binding for a component that loads and boots firmware +on the Qualcomm Technology Inc. Hexagon v56 core. + +- compatible: + Usage: required + Value type: + Definition: must be one of: + "qcom,qcs404-cdsp-pil", + "qcom,sdm845-adsp-pil" + +- reg: + Usage: required + Value type: + Definition: must specify the base address and size of the qdsp6ss register + +- interrupts-extended: + Usage: required + Value type: + Definition: must list the watchdog, fatal IRQs ready, handover and + stop-ack IRQs + +- interrupt-names: + Usage: required + Value type: + Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack" + +- clocks: + Usage: required + Value type: + Definition: List of phandles and clock specifier pairs for the Hexagon, + per clock-names below. + +- clock-names: + Usage: required for SDM845 ADSP + Value type: + Definition: List of clock input name strings sorted in the same + order as the clocks property. Definition must have + "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr", + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep" + and "qdsp6ss_core". + +- clock-names: + Usage: required for QCS404 CDSP + Value type: + Definition: List of clock input name strings sorted in the same + order as the clocks property. Definition must have + "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave", + "q6ss_master", "q6_axim". + +- power-domains: + Usage: required + Value type: + Definition: reference to cx power domain node. + +- resets: + Usage: required + Value type: + Definition: reference to the list of resets for the Hexagon. + +- reset-names: + Usage: required for SDM845 ADSP + Value type: + Definition: must be "pdc_sync" and "cc_lpass" + +- reset-names: + Usage: required for QCS404 CDSP + Value type: + Definition: must be "restart" + +- qcom,halt-regs: + Usage: required + Value type: + Definition: a phandle reference to a syscon representing TCSR followed + by the offset within syscon for Hexagon halt register. + +- memory-region: + Usage: required + Value type: + Definition: reference to the reserved-memory for the firmware + +- qcom,smem-states: + Usage: required + Value type: + Definition: reference to the smem state for requesting the Hexagon to + shut down + +- qcom,smem-state-names: + Usage: required + Value type: + Definition: must be "stop" + + += SUBNODES +The adsp node may have an subnode named "glink-edge" that describes the +communication edge, channels and devices related to the Hexagon. +See ../soc/qcom/qcom,glink.txt for details on how to describe these. + += EXAMPLE +The following example describes the resources needed to boot control the +ADSP, as it is found on SDM845 boards. + + remoteproc@17300000 { + compatible = "qcom,sdm845-adsp-pil"; + reg = <0x17300000 0x40c>; + + interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>, + <&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>; + interrupt-names = "wdog", "fatal", "ready", + "handover", "stop-ack"; + + clocks = <&rpmhcc RPMH_CXO_CLK>, + <&gcc GCC_LPASS_SWAY_CLK>, + <&lpasscc LPASS_Q6SS_AHBS_AON_CLK>, + <&lpasscc LPASS_Q6SS_AHBM_AON_CLK>, + <&lpasscc LPASS_QDSP6SS_XO_CLK>, + <&lpasscc LPASS_QDSP6SS_SLEEP_CLK>, + <&lpasscc LPASS_QDSP6SS_CORE_CLK>; + clock-names = "xo", "sway_cbcr", + "lpass_ahbs_aon_cbcr", + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", + "qdsp6ss_sleep", "qdsp6ss_core"; + + power-domains = <&rpmhpd SDM845_CX>; + + resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>, + <&aoss_reset AOSS_CC_LPASS_RESTART>; + reset-names = "pdc_sync", "cc_lpass"; + + qcom,halt-regs = <&tcsr_mutex_regs 0x22000>; + + memory-region = <&pil_adsp_mem>; + + qcom,smem-states = <&adsp_smp2p_out 0>; + qcom,smem-state-names = "stop"; + }; -- cgit From b270ea40b3c6e5d269ae6e1c74db401f7b1dcef0 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Sat, 29 Dec 2018 00:23:00 +0530 Subject: dt-bindings: soc: qcom: Add remote-pid binding for GLINK SMEM Add missing qcom,remote-pid dt binding required for GLINK SMEM which specifies the remote endpoint of the GLINK edge. Fixes: 2b41d6c8e696 ("dt-bindings: soc: qcom: Extend GLINK to cover SMEM") Reviewed-by: Doug Anderson Reviewed-by: Rob Herring Signed-off-by: Sibi Sankar Signed-off-by: Bjorn Andersson --- Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt index cf759e5f9b10..1214192847ac 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt @@ -21,6 +21,11 @@ edge. Definition: should specify the IRQ used by the remote processor to signal this processor about communication related events +- qcom,remote-pid: + Usage: required for glink-smem + Value type: + Definition: specifies the identifier of the remote endpoint of this edge + - qcom,rpm-msg-ram: Usage: required for glink-rpm Value type: -- cgit From 97266c4d05345f9b500d10c3caa1070249e895e7 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 May 2019 16:23:25 -0500 Subject: spi: dt-bindings: Convert spi-gpio binding to json-schema Convert the spi-gpio binding to DT schema format. Cc: Mark Brown Cc: Linus Walleij Cc: linux-spi@vger.kernel.org Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/spi-gpio.txt | 43 ------------- .../devicetree/bindings/spi/spi-gpio.yaml | 72 ++++++++++++++++++++++ 2 files changed, 72 insertions(+), 43 deletions(-) delete mode 100644 Documentation/devicetree/bindings/spi/spi-gpio.txt create mode 100644 Documentation/devicetree/bindings/spi/spi-gpio.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.txt b/Documentation/devicetree/bindings/spi/spi-gpio.txt deleted file mode 100644 index 52db562f17a4..000000000000 --- a/Documentation/devicetree/bindings/spi/spi-gpio.txt +++ /dev/null @@ -1,43 +0,0 @@ -SPI-GPIO devicetree bindings - -This represents a group of 3-n GPIO lines used for bit-banged SPI on dedicated -GPIO lines. - -Required properties: - - - compatible: should be set to "spi-gpio" - - #address-cells: should be set to <0x1> - - ranges - - sck-gpios: GPIO spec for the SCK line to use - - miso-gpios: GPIO spec for the MISO line to use - - mosi-gpios: GPIO spec for the MOSI line to use - - cs-gpios: GPIOs to use for chipselect lines. - Not needed if num-chipselects = <0>. - - num-chipselects: Number of chipselect lines. Should be <0> if a single device - with no chip select is connected. - -Deprecated bindings: - -These legacy GPIO line bindings can alternatively be used to define the -GPIO lines used, they should not be used in new device trees. - - - gpio-sck: GPIO spec for the SCK line to use - - gpio-miso: GPIO spec for the MISO line to use - - gpio-mosi: GPIO spec for the MOSI line to use - -Example: - - spi { - compatible = "spi-gpio"; - #address-cells = <0x1>; - ranges; - - sck-gpios = <&gpio 95 0>; - miso-gpios = <&gpio 98 0>; - mosi-gpios = <&gpio 97 0>; - cs-gpios = <&gpio 125 0>; - num-chipselects = <1>; - - /* clients */ - }; - diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.yaml b/Documentation/devicetree/bindings/spi/spi-gpio.yaml new file mode 100644 index 000000000000..55c4f1705f07 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-gpio.yaml @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/spi-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SPI-GPIO devicetree bindings + +maintainers: + - Rob Herring + +description: + This represents a group of 3-n GPIO lines used for bit-banged SPI on + dedicated GPIO lines. + +allOf: + - $ref: "/schemas/spi/spi-controller.yaml#" + +properties: + compatible: + const: spi-gpio + + sck-gpios: + description: GPIO spec for the SCK line to use + maxItems: 1 + + miso-gpios: + description: GPIO spec for the MISO line to use + maxItems: 1 + + mosi-gpios: + description: GPIO spec for the MOSI line to use + maxItems: 1 + + cs-gpios: + description: GPIOs to use for chipselect lines. + Not needed if num-chipselects = <0>. + minItems: 1 + maxItems: 1024 + + num-chipselects: + description: Number of chipselect lines. Should be <0> if a single device + with no chip select is connected. + $ref: "/schemas/types.yaml#/definitions/uint32" + + # Deprecated properties + gpio-sck: false + gpio-miso: false + gpio-mosi: false + +required: + - compatible + - num-chipselects + - sck-gpios + +examples: + - | + spi { + compatible = "spi-gpio"; + #address-cells = <0x1>; + #size-cells = <0x0>; + + sck-gpios = <&gpio 95 0>; + miso-gpios = <&gpio 98 0>; + mosi-gpios = <&gpio 97 0>; + cs-gpios = <&gpio 125 0>; + num-chipselects = <1>; + + /* clients */ + }; + +... -- cgit From 1914a996436b09186489da73b807e1df71259f67 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 May 2019 16:20:29 -0500 Subject: regulator: Convert regulator binding to json-schema Convert the common regulator binding to DT schema format. Note that all the properties with standard unit suffixes have type checks already, so only a description is necessary. As fixed-regulator has already been converted, update the references in it. Otherwise, keep regulator.txt with a reference to the schema to avoid a bunch of treewide updates. regulator.txt can be removed when all the regulator bindings are converted. Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/regulator/fixed-regulator.yaml | 5 +- .../devicetree/bindings/regulator/regulator.txt | 140 +-------------- .../devicetree/bindings/regulator/regulator.yaml | 200 +++++++++++++++++++++ 3 files changed, 205 insertions(+), 140 deletions(-) create mode 100644 Documentation/devicetree/bindings/regulator/regulator.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml index d289c2f7455a..a650b457085d 100644 --- a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml @@ -12,10 +12,13 @@ maintainers: description: Any property defined as part of the core regulator binding, defined in - regulator.txt, can also be used. However a fixed voltage regulator is + regulator.yaml, can also be used. However a fixed voltage regulator is expected to have the regulator-min-microvolt and regulator-max-microvolt to be the same. +allOf: + - $ref: "regulator.yaml#" + properties: compatible: const: regulator-fixed diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt index 0a3f087d5844..487ccd8370b3 100644 --- a/Documentation/devicetree/bindings/regulator/regulator.txt +++ b/Documentation/devicetree/bindings/regulator/regulator.txt @@ -1,139 +1 @@ -Voltage/Current Regulators - -Optional properties: -- regulator-name: A string used as a descriptive name for regulator outputs -- regulator-min-microvolt: smallest voltage consumers may set -- regulator-max-microvolt: largest voltage consumers may set -- regulator-microvolt-offset: Offset applied to voltages to compensate for voltage drops -- regulator-min-microamp: smallest current consumers may set -- regulator-max-microamp: largest current consumers may set -- regulator-input-current-limit-microamp: maximum input current regulator allows -- regulator-always-on: boolean, regulator should never be disabled -- regulator-boot-on: bootloader/firmware enabled regulator -- regulator-allow-bypass: allow the regulator to go into bypass mode -- regulator-allow-set-load: allow the regulator performance level to be configured -- -supply: phandle to the parent supply/regulator node -- regulator-ramp-delay: ramp delay for regulator(in uV/us) - For hardware which supports disabling ramp rate, it should be explicitly - initialised to zero (regulator-ramp-delay = <0>) for disabling ramp delay. -- regulator-enable-ramp-delay: The time taken, in microseconds, for the supply - rail to reach the target voltage, plus/minus whatever tolerance the board - design requires. This property describes the total system ramp time - required due to the combination of internal ramping of the regulator itself, - and board design issues such as trace capacitance and load on the supply. -- regulator-settling-time-us: Settling time, in microseconds, for voltage - change if regulator have the constant time for any level voltage change. - This is useful when regulator have exponential voltage change. -- regulator-settling-time-up-us: Settling time, in microseconds, for voltage - increase if the regulator needs a constant time to settle after voltage - increases of any level. This is useful for regulators with exponential - voltage changes. -- regulator-settling-time-down-us: Settling time, in microseconds, for voltage - decrease if the regulator needs a constant time to settle after voltage - decreases of any level. This is useful for regulators with exponential - voltage changes. -- regulator-soft-start: Enable soft start so that voltage ramps slowly -- regulator-state-standby sub-root node for Standby mode - : equivalent with standby Linux sleep state, which provides energy savings - with a relatively quick transition back time. -- regulator-state-mem sub-root node for Suspend-to-RAM mode - : suspend to memory, the device goes to sleep, but all data stored in memory, - only some external interrupt can wake the device. -- regulator-state-disk sub-root node for Suspend-to-DISK mode - : suspend to disk, this state operates similarly to Suspend-to-RAM, - but includes a final step of writing memory contents to disk. -- regulator-state-[mem/disk/standby] node has following common properties: - - regulator-on-in-suspend: regulator should be on in suspend state. - - regulator-off-in-suspend: regulator should be off in suspend state. - - regulator-suspend-min-microvolt: minimum voltage may be set in - suspend state. - - regulator-suspend-max-microvolt: maximum voltage may be set in - suspend state. - - regulator-suspend-microvolt: the default voltage which regulator - would be set in suspend. This property is now deprecated, instead - setting voltage for suspend mode via the API which regulator - driver provides is recommended. - - regulator-changeable-in-suspend: whether the default voltage and - the regulator on/off in suspend can be changed in runtime. - - regulator-mode: operating mode in the given suspend state. - The set of possible operating modes depends on the capabilities of - every hardware so the valid modes are documented on each regulator - device tree binding document. -- regulator-initial-mode: initial operating mode. The set of possible operating - modes depends on the capabilities of every hardware so each device binding - documentation explains which values the regulator supports. -- regulator-allowed-modes: list of operating modes that software is allowed to - configure for the regulator at run-time. Elements may be specified in any - order. The set of possible operating modes depends on the capabilities of - every hardware so each device binding document explains which values the - regulator supports. -- regulator-system-load: Load in uA present on regulator that is not captured by - any consumer request. -- regulator-pull-down: Enable pull down resistor when the regulator is disabled. -- regulator-over-current-protection: Enable over current protection. -- regulator-active-discharge: tristate, enable/disable active discharge of - regulators. The values are: - 0: Disable active discharge. - 1: Enable active discharge. - Absence of this property will leave configuration to default. -- regulator-coupled-with: Regulators with which the regulator - is coupled. The linkage is 2-way - all coupled regulators should be linked - with each other. A regulator should not be coupled with its supplier. -- regulator-coupled-max-spread: Array of maximum spread between voltages of - coupled regulators in microvolts, each value in the array relates to the - corresponding couple specified by the regulator-coupled-with property. -- regulator-max-step-microvolt: Maximum difference between current and target - voltages that can be changed safely in a single step. - -Deprecated properties: -- regulator-compatible: If a regulator chip contains multiple - regulators, and if the chip's binding contains a child node that - describes each regulator, then this property indicates which regulator - this child node is intended to configure. If this property is missing, - the node's name will be used instead. - -Example: - - xyzreg: regulator@0 { - regulator-min-microvolt = <1000000>; - regulator-max-microvolt = <2500000>; - regulator-always-on; - vin-supply = <&vin>; - - regulator-state-mem { - regulator-on-in-suspend; - }; - }; - -Regulator Consumers: -Consumer nodes can reference one or more of its supplies/ -regulators using the below bindings. - -- -supply: phandle to the regulator node - -These are the same bindings that a regulator in the above -example used to reference its own supply, in which case -its just seen as a special case of a regulator being a -consumer itself. - -Example of a consumer device node (mmc) referencing two -regulators (twl_reg1 and twl_reg2), - - twl_reg1: regulator@0 { - ... - ... - ... - }; - - twl_reg2: regulator@1 { - ... - ... - ... - }; - - mmc: mmc@0 { - ... - ... - vmmc-supply = <&twl_reg1>; - vmmcaux-supply = <&twl_reg2>; - }; +This file has moved to regulator.yaml. diff --git a/Documentation/devicetree/bindings/regulator/regulator.yaml b/Documentation/devicetree/bindings/regulator/regulator.yaml new file mode 100644 index 000000000000..02c3043ce419 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/regulator.yaml @@ -0,0 +1,200 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/regulator.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Voltage/Current Regulators + +maintainers: + - Liam Girdwood + - Mark Brown + +properties: + regulator-name: + description: A string used as a descriptive name for regulator outputs + $ref: "/schemas/types.yaml#/definitions/string" + + regulator-min-microvolt: + description: smallest voltage consumers may set + + regulator-max-microvolt: + description: largest voltage consumers may set + + regulator-microvolt-offset: + description: Offset applied to voltages to compensate for voltage drops + + regulator-min-microamp: + description: smallest current consumers may set + + regulator-max-microamp: + description: largest current consumers may set + + regulator-input-current-limit-microamp: + description: maximum input current regulator allows + + regulator-always-on: + description: boolean, regulator should never be disabled + type: boolean + + regulator-boot-on: + description: bootloader/firmware enabled regulator + type: boolean + + regulator-allow-bypass: + description: allow the regulator to go into bypass mode + type: boolean + + regulator-allow-set-load: + description: allow the regulator performance level to be configured + type: boolean + + regulator-ramp-delay: + description: ramp delay for regulator(in uV/us) For hardware which supports + disabling ramp rate, it should be explicitly initialised to zero (regulator-ramp-delay + = <0>) for disabling ramp delay. + $ref: "/schemas/types.yaml#/definitions/uint32" + + regulator-enable-ramp-delay: + description: The time taken, in microseconds, for the supply rail to + reach the target voltage, plus/minus whatever tolerance the board + design requires. This property describes the total system ramp time + required due to the combination of internal ramping of the regulator + itself, and board design issues such as trace capacitance and load + on the supply. + $ref: "/schemas/types.yaml#/definitions/uint32" + + regulator-settling-time-us: + description: Settling time, in microseconds, for voltage change if regulator + have the constant time for any level voltage change. This is useful + when regulator have exponential voltage change. + + regulator-settling-time-up-us: + description: Settling time, in microseconds, for voltage increase if + the regulator needs a constant time to settle after voltage increases + of any level. This is useful for regulators with exponential voltage + changes. + + regulator-settling-time-down-us: + description: Settling time, in microseconds, for voltage decrease if + the regulator needs a constant time to settle after voltage decreases + of any level. This is useful for regulators with exponential voltage + changes. + + regulator-soft-start: + description: Enable soft start so that voltage ramps slowly + type: boolean + + regulator-initial-mode: + description: initial operating mode. The set of possible operating modes + depends on the capabilities of every hardware so each device binding + documentation explains which values the regulator supports. + $ref: "/schemas/types.yaml#/definitions/uint32" + + regulator-allowed-modes: + description: list of operating modes that software is allowed to configure + for the regulator at run-time. Elements may be specified in any order. + The set of possible operating modes depends on the capabilities of + every hardware so each device binding document explains which values + the regulator supports. + $ref: "/schemas/types.yaml#/definitions/uint32-array" + + regulator-system-load: + description: Load in uA present on regulator that is not captured by + any consumer request. + $ref: "/schemas/types.yaml#/definitions/uint32" + + regulator-pull-down: + description: Enable pull down resistor when the regulator is disabled. + type: boolean + + regulator-over-current-protection: + description: Enable over current protection. + type: boolean + + regulator-active-discharge: + description: | + tristate, enable/disable active discharge of regulators. The values are: + 0: Disable active discharge. + 1: Enable active discharge. + Absence of this property will leave configuration to default. + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - enum: [ 0, 1 ] + + regulator-coupled-with: + description: Regulators with which the regulator is coupled. The linkage + is 2-way - all coupled regulators should be linked with each other. + A regulator should not be coupled with its supplier. + $ref: "/schemas/types.yaml#/definitions/phandle-array" + + regulator-coupled-max-spread: + description: Array of maximum spread between voltages of coupled regulators + in microvolts, each value in the array relates to the corresponding + couple specified by the regulator-coupled-with property. + $ref: "/schemas/types.yaml#/definitions/uint32" + + regulator-max-step-microvolt: + description: Maximum difference between current and target voltages + that can be changed safely in a single step. + +patternProperties: + ".*-supply$": + description: Input supply phandle(s) for this node + + regulator-state-(standby|mem|disk): + type: object + description: + sub-nodes for regulator state in Standby, Suspend-to-RAM, and + Suspend-to-DISK modes. Equivalent with standby, mem, and disk Linux + sleep states. + + properties: + regulator-on-in-suspend: + description: regulator should be on in suspend state. + type: boolean + + regulator-off-in-suspend: + description: regulator should be off in suspend state. + type: boolean + + regulator-suspend-min-microvolt: + description: minimum voltage may be set in suspend state. + + regulator-suspend-max-microvolt: + description: maximum voltage may be set in suspend state. + + regulator-suspend-microvolt: + description: the default voltage which regulator would be set in + suspend. This property is now deprecated, instead setting voltage + for suspend mode via the API which regulator driver provides is + recommended. + + regulator-changeable-in-suspend: + description: whether the default voltage and the regulator on/off + in suspend can be changed in runtime. + type: boolean + + regulator-mode: + description: operating mode in the given suspend state. The set + of possible operating modes depends on the capabilities of every + hardware so the valid modes are documented on each regulator device + tree binding document. + $ref: "/schemas/types.yaml#/definitions/uint32" + + additionalProperties: false + +examples: + - | + xyzreg: regulator@0 { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <2500000>; + regulator-always-on; + vin-supply = <&vin>; + + regulator-state-mem { + regulator-on-in-suspend; + }; + }; + +... -- cgit From 673e401effe9dfd655f2d16588248f4c043361ce Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 May 2019 16:20:30 -0500 Subject: regulator: Convert gpio-regulator to json-schema Convert the gpio-regulator binding to DT schema format using json-schema. Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/regulator/gpio-regulator.txt | 57 ---------- .../bindings/regulator/gpio-regulator.yaml | 118 +++++++++++++++++++++ 2 files changed, 118 insertions(+), 57 deletions(-) delete mode 100644 Documentation/devicetree/bindings/regulator/gpio-regulator.txt create mode 100644 Documentation/devicetree/bindings/regulator/gpio-regulator.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt b/Documentation/devicetree/bindings/regulator/gpio-regulator.txt deleted file mode 100644 index dd25e73b5d79..000000000000 --- a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt +++ /dev/null @@ -1,57 +0,0 @@ -GPIO controlled regulators - -Required properties: -- compatible : Must be "regulator-gpio". -- regulator-name : Defined in regulator.txt as optional, but required - here. -- gpios : Array of one or more GPIO pins used to select the - regulator voltage/current listed in "states". -- states : Selection of available voltages/currents provided by - this regulator and matching GPIO configurations to - achieve them. If there are no states in the "states" - array, use a fixed regulator instead. - -Optional properties: -- enable-gpios : GPIO used to enable/disable the regulator. - Warning, the GPIO phandle flags are ignored and the - GPIO polarity is controlled solely by the presence - of "enable-active-high" DT property. This is due to - compatibility with old DTs. -- enable-active-high : Polarity of "enable-gpio" GPIO is active HIGH. - Default is active LOW. -- gpios-states : On operating systems, that don't support reading back - gpio values in output mode (most notably linux), this - array provides the state of GPIO pins set when - requesting them from the gpio controller. Systems, - that are capable of preserving state when requesting - the lines, are free to ignore this property. - 0: LOW, 1: HIGH. Default is LOW if nothing else - is specified. -- startup-delay-us : Startup time in microseconds. -- regulator-type : Specifies what is being regulated, must be either - "voltage" or "current", defaults to voltage. - -Any property defined as part of the core regulator binding defined in -regulator.txt can also be used. - -Example: - - mmciv: gpio-regulator { - compatible = "regulator-gpio"; - - regulator-name = "mmci-gpio-supply"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <2600000>; - regulator-boot-on; - - enable-gpios = <&gpio0 23 0x4>; - gpios = <&gpio0 24 0x4 - &gpio0 25 0x4>; - states = <1800000 0x3 - 2200000 0x2 - 2600000 0x1 - 2900000 0x0>; - - startup-delay-us = <100000>; - enable-active-high; - }; diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml new file mode 100644 index 000000000000..9d3b28417fb6 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/gpio-regulator.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: GPIO controlled regulators + +maintainers: + - Liam Girdwood + - Mark Brown + +description: + Any property defined as part of the core regulator binding, defined in + regulator.txt, can also be used. + +allOf: + - $ref: "regulator.yaml#" + +properties: + compatible: + const: regulator-gpio + + regulator-name: true + + enable-gpios: + description: GPIO to use to enable/disable the regulator. + Warning, the GPIO phandle flags are ignored and the GPIO polarity is + controlled solely by the presence of "enable-active-high" DT property. + This is due to compatibility with old DTs. + maxItems: 1 + + gpios: + description: Array of one or more GPIO pins used to select the regulator + voltage/current listed in "states". + minItems: 1 + maxItems: 8 # Should be enough... + + gpios-states: + description: | + On operating systems, that don't support reading back gpio values in + output mode (most notably linux), this array provides the state of GPIO + pins set when requesting them from the gpio controller. Systems, that are + capable of preserving state when requesting the lines, are free to ignore + this property. + 0: LOW + 1: HIGH + Default is LOW if nothing else is specified. + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32-array + - maxItems: 8 + items: + enum: [ 0, 1 ] + default: 0 + + states: + description: Selection of available voltages/currents provided by this + regulator and matching GPIO configurations to achieve them. If there are + no states in the "states" array, use a fixed regulator instead. + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32-matrix + - maxItems: 8 + items: + items: + - description: Voltage in microvolts + - description: GPIO group state value + + startup-delay-us: + description: startup time in microseconds + + enable-active-high: + description: Polarity of "enable-gpio" GPIO is active HIGH. Default is + active LOW. + type: boolean + + gpio-open-drain: + description: + GPIO is open drain type. If this property is missing then default + assumption is false. + type: boolean + + regulator-type: + description: Specifies what is being regulated. + allOf: + - $ref: /schemas/types.yaml#/definitions/string + - enum: + - voltage + - current + default: voltage + +required: + - compatible + - regulator-name + - gpios + - states + +examples: + - | + gpio-regulator { + compatible = "regulator-gpio"; + + regulator-name = "mmci-gpio-supply"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <2600000>; + regulator-boot-on; + + enable-gpios = <&gpio0 23 0x4>; + gpios = <&gpio0 24 0x4 + &gpio0 25 0x4>; + states = <1800000 0x3>, + <2200000 0x2>, + <2600000 0x1>, + <2900000 0x0>; + + startup-delay-us = <100000>; + enable-active-high; + }; +... -- cgit From 27b1b58fcfe72fd6b53a83b0dccb678e6b6faab8 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 May 2019 16:20:31 -0500 Subject: regulator: Convert max8660 binding to json-schema Convert the max8660 binding to DT schema format using json-schema. Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/max8660.txt | 47 ------------- .../devicetree/bindings/regulator/max8660.yaml | 77 ++++++++++++++++++++++ 2 files changed, 77 insertions(+), 47 deletions(-) delete mode 100644 Documentation/devicetree/bindings/regulator/max8660.txt create mode 100644 Documentation/devicetree/bindings/regulator/max8660.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/regulator/max8660.txt b/Documentation/devicetree/bindings/regulator/max8660.txt deleted file mode 100644 index 8ba994d8a142..000000000000 --- a/Documentation/devicetree/bindings/regulator/max8660.txt +++ /dev/null @@ -1,47 +0,0 @@ -Maxim MAX8660 voltage regulator - -Required properties: -- compatible: must be one of "maxim,max8660", "maxim,max8661" -- reg: I2C slave address, usually 0x34 -- any required generic properties defined in regulator.txt - -Example: - - i2c_master { - max8660@34 { - compatible = "maxim,max8660"; - reg = <0x34>; - - regulators { - regulator@0 { - regulator-compatible= "V3(DCDC)"; - regulator-min-microvolt = <725000>; - regulator-max-microvolt = <1800000>; - }; - - regulator@1 { - regulator-compatible= "V4(DCDC)"; - regulator-min-microvolt = <725000>; - regulator-max-microvolt = <1800000>; - }; - - regulator@2 { - regulator-compatible= "V5(LDO)"; - regulator-min-microvolt = <1700000>; - regulator-max-microvolt = <2000000>; - }; - - regulator@3 { - regulator-compatible= "V6(LDO)"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <3300000>; - }; - - regulator@4 { - regulator-compatible= "V7(LDO)"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <3300000>; - }; - }; - }; - }; diff --git a/Documentation/devicetree/bindings/regulator/max8660.yaml b/Documentation/devicetree/bindings/regulator/max8660.yaml new file mode 100644 index 000000000000..9c038698f880 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/max8660.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/max8660.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim MAX8660 voltage regulator + +maintainers: + - Daniel Mack + +properties: + $nodename: + pattern: "pmic@[0-9a-f]{1,2}" + compatible: + enum: + - maxim,max8660 + - maxim,max8661 + + reg: + maxItems: 1 + + regulators: + type: object + + patternProperties: + "regulator-.+": + $ref: "regulator.yaml#" + + additionalProperties: false + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@34 { + compatible = "maxim,max8660"; + reg = <0x34>; + + regulators { + regulator-V3 { + regulator-compatible= "V3(DCDC)"; + regulator-min-microvolt = <725000>; + regulator-max-microvolt = <1800000>; + }; + + regulator-V4 { + regulator-compatible= "V4(DCDC)"; + regulator-min-microvolt = <725000>; + regulator-max-microvolt = <1800000>; + }; + + regulator-V5 { + regulator-compatible= "V5(LDO)"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <2000000>; + }; + + regulator-V6 { + regulator-compatible= "V6(LDO)"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + }; + + regulator-V7 { + regulator-compatible= "V7(LDO)"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + }; + }; + }; + }; +... -- cgit From 9c3f3410f5ace868c7c82fe4479fd36bed5eee64 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Wed, 22 May 2019 18:43:19 +0000 Subject: dt-bindings: phy: dp83867: Describe how driver behaves w.r.t rgmii delay Add a note to make it more clear how the driver behaves when "rgmii" vs "rgmii-id", "rgmii-idrx", or "rgmii-idtx" interface modes are selected. Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Trent Piepho Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ti,dp83867.txt | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt index 9ef9338aaee1..99b8681bde49 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83867.txt +++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt @@ -11,6 +11,14 @@ Required properties: - ti,fifo-depth - Transmitt FIFO depth- see dt-bindings/net/ti-dp83867.h for applicable values +Note: If the interface type is PHY_INTERFACE_MODE_RGMII the TX/RX clock delays + will be left at their default values, as set by the PHY's pin strapping. + The default strapping will use a delay of 2.00 ns. Thus + PHY_INTERFACE_MODE_RGMII, by default, does not behave as RGMII with no + internal delay, but as PHY_INTERFACE_MODE_RGMII_ID. The device tree + should use "rgmii-id" if internal delays are desired as this may be + changed in future to cause "rgmii" mode to disable delays. + Optional property: - ti,min-output-impedance - MAC Interface Impedance control to set the programmable output impedance to -- cgit From 980066e6d9642fa5854bed8e592b1a30ea885b76 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Wed, 22 May 2019 18:43:21 +0000 Subject: dt-bindings: phy: dp83867: Add documentation for disabling clock output The clock output is generally only used for testing and development and not used to daisy-chain PHYs. It's just a source of RF noise afterward. Add a mux value for "off". I've added it as another enumeration to the output property. In the actual PHY, the mux and the output enable are independently controllable. However, it doesn't seem useful to be able to describe the mux setting when the output is disabled. Document that PHY's default setting will be left as is if the property is omitted. Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Trent Piepho Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ti,dp83867.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt index 99b8681bde49..db6aa3f2215b 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83867.txt +++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt @@ -33,8 +33,10 @@ Optional property: software needs to take when this pin is strapped in these modes. See data manual for details. - - ti,clk-output-sel - Muxing option for CLK_OUT pin - see dt-bindings/net/ti-dp83867.h - for applicable values. + - ti,clk-output-sel - Muxing option for CLK_OUT pin. See dt-bindings/net/ti-dp83867.h + for applicable values. The CLK_OUT pin can also + be disabled by this property. When omitted, the + PHY's default will be left as is. Note: ti,min-output-impedance and ti,max-output-impedance are mutually exclusive. When both properties are present ti,max-output-impedance -- cgit From a0b2ff531582c510af11b6d548732eb2fd7772ff Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 20 May 2019 21:43:49 +0200 Subject: dt-bindings: soc: amlogic: canvas: document support for Meson8/8b/8m2 The canvas IP on Meson8, Meson8b and Meson8m2 is similar to the one found on GXBB and newer. The only known difference is that the older SoCs cannot configure the "endianness". Add a compatible string for each of the older SoCs to make sure we won't be using unsupported features on these SoCs. Signed-off-by: Martin Blumenstingl Acked-by: Maxime Jourdan Signed-off-by: Kevin Hilman --- .../devicetree/bindings/soc/amlogic/amlogic,canvas.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt index 436d2106e80d..e876f3ce54f6 100644 --- a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt +++ b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt @@ -2,8 +2,8 @@ Amlogic Canvas ================================ A canvas is a collection of metadata that describes a pixel buffer. -Those metadata include: width, height, phyaddr, wrapping, block mode -and endianness. +Those metadata include: width, height, phyaddr, wrapping and block mode. +Starting with GXBB the endianness can also be described. Many IPs within Amlogic SoCs rely on canvas indexes to read/write pixel data rather than use the phy addresses directly. For instance, this is the case for @@ -18,7 +18,11 @@ Video Lookup Table -------------------------- Required properties: -- compatible: "amlogic,canvas" +- compatible: has to be one of: + - "amlogic,meson8-canvas", "amlogic,canvas" on Meson8 + - "amlogic,meson8b-canvas", "amlogic,canvas" on Meson8b + - "amlogic,meson8m2-canvas", "amlogic,canvas" on Meson8m2 + - "amlogic,canvas" on GXBB and newer - reg: Base physical address and size of the canvas registers. Example: -- cgit From 3e75b76f0f17194e0e65694ade6e69fc11593190 Mon Sep 17 00:00:00 2001 From: Guillaume La Roque Date: Tue, 14 May 2019 10:26:47 +0200 Subject: dt-bindings: pinctrl: add a 'drive-strength-microamp' property This property allow drive-strength parameter in uA instead of mA. Signed-off-by: Guillaume La Roque Acked-by: Martin Blumenstingl Reviewed-by: Martin Blumenstingl Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt index cef2b5855d60..fcd37e93ed4d 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt @@ -258,6 +258,7 @@ drive-push-pull - drive actively high and low drive-open-drain - drive with open drain drive-open-source - drive with open source drive-strength - sink or source at most X mA +drive-strength-microamp - sink or source at most X uA input-enable - enable input on pin (no effect on output, such as enabling an input buffer) input-disable - disable input on pin (no effect on output, such as @@ -326,6 +327,8 @@ arguments are described below. - drive-strength takes as argument the target strength in mA. +- drive-strength-microamp takes as argument the target strength in uA. + - input-debounce takes the debounce time in usec as argument or 0 to disable debouncing -- cgit From 013786c043298710887e983ac8d59aaff1f554f3 Mon Sep 17 00:00:00 2001 From: Guillaume La Roque Date: Tue, 14 May 2019 10:26:49 +0200 Subject: dt-bindings: pinctrl: meson: Add drive-strength-microamp property Add optional drive-strength-microamp property Signed-off-by: Guillaume La Roque Reviewed-by: Martin Blumenstingl Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt index a47dd990a8d3..a7618605bf1e 100644 --- a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt @@ -51,6 +51,10 @@ Configuration nodes support the generic properties "bias-disable", "bias-pull-up" and "bias-pull-down", described in file pinctrl-bindings.txt +Optional properties : + - drive-strength-microamp: Drive strength for the specified pins in uA. + This property is only valid for G12A and newer. + === Example === pinctrl: pinctrl@c1109880 { -- cgit From f3fbedabb7be0d19b2da862fa6c01d82ac39c716 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 9 May 2019 13:59:53 -0700 Subject: dt-bindings: pinctrl: bcm2835-gpio: Document BCM7211 compatible BCM7211 has a slightly different block layout and some additional GPIO registers that were added, document the compatible string. Signed-off-by: Florian Fainelli Reviewed-by: Eric Anholt Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt index 3fac0a061bcc..ac6d614d74e0 100644 --- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt @@ -5,6 +5,9 @@ controller, and pinmux/control device. Required properties: - compatible: "brcm,bcm2835-gpio" +- compatible: should be one of: + "brcm,bcm2835-gpio" - BCM2835 compatible pinctrl + "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl - reg: Should contain the physical address of the GPIO module's registers. - gpio-controller: Marks the device node as a GPIO controller. - #gpio-cells : Should be two. The first cell is the pin number and the -- cgit From 7745f03eb39587dd15a1fb26e6223678b8e906d2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 13 May 2019 13:58:45 -0400 Subject: x86/topology: Add CPUID.1F multi-die/package support Some new systems have multiple software-visible die within each package. Update Linux parsing of the Intel CPUID "Extended Topology Leaf" to handle either CPUID.B, or the new CPUID.1F. Add cpuinfo_x86.die_id and cpuinfo_x86.max_dies to store the result. die_id will be non-zero only for multi-die/package systems. Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: linux-doc@vger.kernel.org Link: https://lkml.kernel.org/r/7b23d2d26d717b8e14ba137c94b70943f1ae4b5c.1557769318.git.len.brown@intel.com --- Documentation/x86/topology.rst | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst index 6e28dbe818ab..8e9704f61017 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/x86/topology.rst @@ -49,6 +49,10 @@ Package-related topology information in the kernel: The number of cores in a package. This information is retrieved via CPUID. + - cpuinfo_x86.x86_max_dies: + + The number of dies in a package. This information is retrieved via CPUID. + - cpuinfo_x86.phys_proc_id: The physical ID of the package. This information is retrieved via CPUID -- cgit From 0e344d8c709fe01d882fc0fb5452bedfe5eba67a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 13 May 2019 13:58:47 -0400 Subject: cpu/topology: Export die_id Export die_id in cpu topology, for the benefit of hardware that has multiple-die/package. Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: linux-doc@vger.kernel.org Link: https://lkml.kernel.org/r/e7d1caaf4fbd24ee40db6d557ab28d7d83298900.1557769318.git.len.brown@intel.com --- Documentation/cputopology.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index cb61277e2308..2ff8a1e9a2db 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -12,6 +12,12 @@ physical_package_id: socket number, but the actual value is architecture and platform dependent. +die_id: + + the CPU die ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + core_id: the CPU core ID of cpuX. Typically it is the hardware platform's @@ -81,6 +87,7 @@ For an architecture to support this feature, it must define some of these macros in include/asm-XXX/topology.h:: #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) #define topology_core_id(cpu) #define topology_book_id(cpu) #define topology_drawer_id(cpu) @@ -99,9 +106,11 @@ provides default definitions for any of the above macros that are not defined by include/asm-XXX/topology.h: 1) topology_physical_package_id: -1 -2) topology_core_id: 0 -3) topology_sibling_cpumask: just the given CPU -4) topology_core_cpumask: just the given CPU +2) topology_die_id: -1 +3) topology_core_id: 0 +4) topology_sibling_cpumask: just the given CPU +5) topology_core_cpumask: just the given CPU +6) topology_die_cpumask: just the given CPU For architectures that don't support books (CONFIG_SCHED_BOOK) there are no default definitions for topology_book_id() and topology_book_cpumask(). -- cgit From b73ed8dc0597c11ec5064d06b9bbd4e541b6d4e7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 13 May 2019 13:58:55 -0400 Subject: topology: Create package_cpus sysfs attribute The existing sysfs cpu/topology/core_siblings (and core_siblings_list) attributes are documented, implemented, and used by programs to represent set of logical CPUs sharing the same package. This makes sense if the next topology level above a core is always a package. But on systems where there is a die topology level between a core and a package, the name and its definition become inconsistent. So without changing its function, add a name for this map that describes what it actually is -- package CPUs -- the set of CPUs that share the same package. This new name will be immune to changes in topology, since it describes threads at the current level, not siblings at a contained level. Suggested-by: Brice Goglin Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/d9d3228b82fb5665e6f93a0ccd033fe022558521.1557769318.git.len.brown@intel.com --- Documentation/cputopology.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index 2ff8a1e9a2db..48af5c290e20 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -46,15 +46,15 @@ thread_siblings_list: human-readable list of cpuX's hardware threads within the same core as cpuX. -core_siblings: +package_cpus: - internal kernel map of cpuX's hardware threads within the same - physical_package_id. + internal kernel map of the CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings") -core_siblings_list: +package_cpus_list: - human-readable list of cpuX's hardware threads within the same - physical_package_id. + human-readable list of CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings_list") book_siblings: -- cgit From 2e4c54dac7b360c3820399bdf06cde9134a4495b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 13 May 2019 13:58:56 -0400 Subject: topology: Create core_cpus and die_cpus sysfs attributes Create CPU topology sysfs attributes: "core_cpus" and "core_cpus_list" These attributes represent all of the logical CPUs that share the same core. These attriutes is synonymous with the existing "thread_siblings" and "thread_siblings_list" attribute, which will be deprecated. Create CPU topology sysfs attributes: "die_cpus" and "die_cpus_list". These attributes represent all of the logical CPUs that share the same die. Suggested-by: Brice Goglin Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/071c23a298cd27ede6ed0b6460cae190d193364f.1557769318.git.len.brown@intel.com --- Documentation/cputopology.txt | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index 48af5c290e20..b90dafcc8237 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -36,15 +36,15 @@ drawer_id: identifier (rather than the kernel's). The actual value is architecture and platform dependent. -thread_siblings: +core_cpus: - internal kernel map of cpuX's hardware threads within the same - core as cpuX. + internal kernel map of CPUs within the same core. + (deprecated name: "thread_siblings") -thread_siblings_list: +core_cpus_list: - human-readable list of cpuX's hardware threads within the same - core as cpuX. + human-readable list of CPUs within the same core. + (deprecated name: "thread_siblings_list"); package_cpus: @@ -56,6 +56,14 @@ package_cpus_list: human-readable list of CPUs sharing the same physical_package_id. (deprecated name: "core_siblings_list") +die_cpus: + + internal kernel map of CPUs within the same die. + +die_cpus_list: + + human-readable list of CPUs within the same die. + book_siblings: internal kernel map of cpuX's hardware threads within the same @@ -93,6 +101,7 @@ these macros in include/asm-XXX/topology.h:: #define topology_drawer_id(cpu) #define topology_sibling_cpumask(cpu) #define topology_core_cpumask(cpu) + #define topology_die_cpumask(cpu) #define topology_book_cpumask(cpu) #define topology_drawer_cpumask(cpu) -- cgit From eabe3bc2689ad0993c57469eb7efb1291443cc74 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 15 May 2019 03:50:41 -0400 Subject: media: cec-ioc-receive.rst: document CEC_MSG_FL_RAW Document this new cec_msg flag. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/cec/cec-ioc-receive.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/media/uapi/cec/cec-ioc-receive.rst b/Documentation/media/uapi/cec/cec-ioc-receive.rst index c3a685ff05cb..4137903d672e 100644 --- a/Documentation/media/uapi/cec/cec-ioc-receive.rst +++ b/Documentation/media/uapi/cec/cec-ioc-receive.rst @@ -223,6 +223,18 @@ View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV'). result of the :ref:`ioctl CEC_TRANSMIT `, and once via :ref:`ioctl CEC_RECEIVE `. + * .. _`CEC-MSG-FL-RAW`: + + - ``CEC_MSG_FL_RAW`` + - 2 + - Normally CEC messages are validated before transmitting them. If this + flag is set when :ref:`ioctl CEC_TRANSMIT ` is called, + then no validation takes place and the message is transmitted as-is. + This is useful when debugging CEC issues. + This flag is only allowed if the process has the ``CAP_SYS_RAWIO`` + capability. If that is not set, then the ``EPERM`` error code is + returned. + .. tabularcolumns:: |p{5.6cm}|p{0.9cm}|p{11.0cm}| @@ -358,7 +370,8 @@ ENOTTY EPERM The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS ` - has never been called. + has never been called, or ``CEC_MSG_FL_RAW`` was used from a process that + did not have the ``CAP_SYS_RAWIO`` capability. ENONET The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS ` -- cgit From 428d3c867df6f2f87721fe7384ea09ad3130f839 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 22 May 2019 05:22:20 -0400 Subject: media: cec-ioc-g-mode.rst: be more specific when EPERM is returned Document which capability is required, rather than just saying that "root permissions" are required. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/cec/cec-ioc-g-mode.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst index c53bb5f73f0d..d0902f356d65 100644 --- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst +++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst @@ -294,7 +294,8 @@ EINVAL The requested mode is invalid. EPERM - Monitor mode is requested without having root permissions + Monitor mode is requested, but the process does have the ``CAP_NET_ADMIN`` + capability. EBUSY Someone else is already an exclusive follower or initiator. -- cgit From cc0f6e96c4fd01fe1f935014c8c87ac6e994324c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 May 2019 16:23:24 -0500 Subject: spi: dt-bindings: Convert Arm pl022 to json-schema Convert the Arm pl022 binding to DT schema format. The clock binding was missing, so it is added to the schema. It really should be required as well, but there are some platforms (spear) not yet using DT clock binding. Cc: Mark Brown Cc: Linus Walleij Cc: linux-spi@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/spi-pl022.yaml | 165 +++++++++++++++++++++ .../devicetree/bindings/spi/spi_pl022.txt | 70 --------- 2 files changed, 165 insertions(+), 70 deletions(-) create mode 100644 Documentation/devicetree/bindings/spi/spi-pl022.yaml delete mode 100644 Documentation/devicetree/bindings/spi/spi_pl022.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/spi-pl022.yaml b/Documentation/devicetree/bindings/spi/spi-pl022.yaml new file mode 100644 index 000000000000..dfb697c69341 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-pl022.yaml @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/spi-pl022.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ARM PL022 SPI controller + +maintainers: + - Linus Walleij + +allOf: + - $ref: "spi-controller.yaml#" + +# We need a select here so we don't match all nodes with 'arm,primecell' +select: + properties: + compatible: + contains: + const: arm,pl022 + required: + - compatible + +properties: + compatible: + items: + - const: arm,pl022 + - const: arm,primecell + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 2 + + clock-names: + items: + - enum: + - SSPCLK + - sspclk + - const: apb_pclk + + pl022,autosuspend-delay: + description: delay in ms following transfer completion before the + runtime power management system suspends the device. A setting of 0 + indicates no delay and the device will be suspended immediately. + $ref: "/schemas/types.yaml#/definitions/uint32" + + pl022,rt: + description: indicates the controller should run the message pump with realtime + priority to minimise the transfer latency on the bus (boolean) + type: boolean + + dmas: + description: + Two or more DMA channel specifiers following the convention outlined + in bindings/dma/dma.txt + minItems: 2 + maxItems: 32 + + dma-names: + description: + There must be at least one channel named "tx" for transmit and named "rx" + for receive. + minItems: 2 + maxItems: 32 + additionalItems: true + items: + - const: rx + - const: tx + +patternProperties: + "^[a-zA-Z][a-zA-Z0-9,+\\-._]{0,63}@[0-9a-f]+$": + type: object + # SPI slave nodes must be children of the SPI master node and can + # contain the following properties. + properties: + pl022,interface: + description: SPI interface type + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - enum: + - 0 # SPI + - 1 # Texas Instruments Synchronous Serial Frame Format + - 2 # Microwire (Half Duplex) + + pl022,com-mode: + description: Specifies the transfer mode + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - enum: + - 0 # interrupt mode + - 1 # polling mode + - 2 # DMA mode + default: 1 + + pl022,rx-level-trig: + description: Rx FIFO watermark level + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - minimum: 0 + maximum: 4 + + pl022,tx-level-trig: + description: Tx FIFO watermark level + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - minimum: 0 + maximum: 4 + + pl022,ctrl-len: + description: Microwire interface - Control length + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - minimum: 0x03 + maximum: 0x1f + + pl022,wait-state: + description: Microwire interface - Wait state + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - enum: [ 0, 1 ] + + pl022,duplex: + description: Microwire interface - Full/Half duplex + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - enum: [ 0, 1 ] + +required: + - compatible + - reg + - interrupts + +examples: + - | + spi@e0100000 { + compatible = "arm,pl022", "arm,primecell"; + reg = <0xe0100000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = <0 31 0x4>; + dmas = <&dma_controller 23 1>, + <&dma_controller 24 0>; + dma-names = "rx", "tx"; + + m25p80@1 { + compatible = "st,m25p80"; + reg = <1>; + spi-max-frequency = <12000000>; + spi-cpol; + spi-cpha; + pl022,interface = <0>; + pl022,com-mode = <0x2>; + pl022,rx-level-trig = <0>; + pl022,tx-level-trig = <0>; + pl022,ctrl-len = <0x11>; + pl022,wait-state = <0>; + pl022,duplex = <0>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/spi/spi_pl022.txt b/Documentation/devicetree/bindings/spi/spi_pl022.txt deleted file mode 100644 index 7638b4968ddb..000000000000 --- a/Documentation/devicetree/bindings/spi/spi_pl022.txt +++ /dev/null @@ -1,70 +0,0 @@ -ARM PL022 SPI controller - -Required properties: -- compatible : "arm,pl022", "arm,primecell" -- reg : Offset and length of the register set for the device -- interrupts : Should contain SPI controller interrupt -- num-cs : total number of chipselects - -Optional properties: -- cs-gpios : should specify GPIOs used for chipselects. - The gpios will be referred to as reg = in the SPI child nodes. - If unspecified, a single SPI device without a chip select can be used. -- pl022,autosuspend-delay : delay in ms following transfer completion before - the runtime power management system suspends the - device. A setting of 0 indicates no delay and the - device will be suspended immediately -- pl022,rt : indicates the controller should run the message pump with realtime - priority to minimise the transfer latency on the bus (boolean) -- dmas : Two or more DMA channel specifiers following the convention outlined - in bindings/dma/dma.txt -- dma-names: Names for the dma channels, if present. There must be at - least one channel named "tx" for transmit and named "rx" for - receive. - - -SPI slave nodes must be children of the SPI master node and can -contain the following properties. - -- pl022,interface : interface type: - 0: SPI - 1: Texas Instruments Synchronous Serial Frame Format - 2: Microwire (Half Duplex) -- pl022,com-mode : specifies the transfer mode: - 0: interrupt mode - 1: polling mode (default mode if property not present) - 2: DMA mode -- pl022,rx-level-trig : Rx FIFO watermark level -- pl022,tx-level-trig : Tx FIFO watermark level -- pl022,ctrl-len : Microwire interface: Control length -- pl022,wait-state : Microwire interface: Wait state -- pl022,duplex : Microwire interface: Full/Half duplex - - -Example: - - spi@e0100000 { - compatible = "arm,pl022", "arm,primecell"; - reg = <0xe0100000 0x1000>; - #address-cells = <1>; - #size-cells = <0>; - interrupts = <0 31 0x4>; - dmas = <&dma-controller 23 1>, - <&dma-controller 24 0>; - dma-names = "rx", "tx"; - - m25p80@1 { - compatible = "st,m25p80"; - reg = <1>; - spi-max-frequency = <12000000>; - spi-cpol; - spi-cpha; - pl022,interface = <0>; - pl022,com-mode = <0x2>; - pl022,rx-level-trig = <0>; - pl022,tx-level-trig = <0>; - pl022,ctrl-len = <0x11>; - pl022,wait-state = <0>; - pl022,duplex = <0>; - }; - }; -- cgit From 1305d97b7c7847b12872818326f0cb4da0439311 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 5 May 2019 10:00:22 -0400 Subject: media: dt-bindings: marvell,mmp2-ccic: Add Marvell MMP2 camera Add Marvell MMP2 camera host interface. Signed-off-by: Lubomir Rintel Reviewed-by: Rob Herring Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- .../bindings/media/marvell,mmp2-ccic.txt | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt new file mode 100644 index 000000000000..7ec2c8c8a3b9 --- /dev/null +++ b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt @@ -0,0 +1,50 @@ +Marvell MMP2 camera host interface + +Required properties: + - compatible: Should be "marvell,mmp2-ccic". + - reg: Register base and size. + - interrupts: The interrupt number. + - #clock-cells: Must be 0. + +Optional properties: + - clocks: Reference to the input clock as specified by + Documentation/devicetree/bindings/clock/clock-bindings.txt. + - clock-names: Names of the clocks used; "axi" for the AXI bus interface, + "func" for the peripheral clock and "phy" for the parallel + video bus interface. + - clock-output-names: Optional clock source for sensors. Shall be "mclk". + +Required subnodes: + - port: The parallel bus interface port with a single endpoint linked to + the sensor's endpoint as described in + Documentation/devicetree/bindings/media/video-interfaces.txt. + +Required endpoint properties: + - bus-type: data bus type, <5> or <6> for Parallel or Bt.656 respectively + - pclk-sample: pixel clock polarity + - hsync-active: horizontal synchronization polarity (only required for + parallel bus) + - vsync-active: vertical synchronization polarity (only required for + parallel bus) + +Example: + + camera0: camera@d420a000 { + compatible = "marvell,mmp2-ccic"; + reg = <0xd420a000 0x800>; + interrupts = <42>; + clocks = <&soc_clocks MMP2_CLK_CCIC0>; + clock-names = "axi"; + #clock-cells = <0>; + clock-output-names = "mclk"; + + port { + camera0_0: endpoint { + remote-endpoint = <&ov7670_0>; + bus-type = <5>; /* Parallel */ + hsync-active = <1>; /* Active high */ + vsync-active = <1>; /* Active high */ + pclk-sample = <0>; /* Falling */ + }; + }; + }; -- cgit From 9db9b76767f133d0e1ce19f01117c83221641899 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 20 May 2019 13:52:53 -0700 Subject: Documentation/x86: Fix path to entry_32.S Signed-off-by: Ira Weiny Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20190520205253.23762-1-ira.weiny@intel.com [ Adjusted the patch to the RST conversion. ] Signed-off-by: Ingo Molnar --- Documentation/x86/exception-tables.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/x86/exception-tables.rst b/Documentation/x86/exception-tables.rst index 24596c8210b5..ed6d4b0cf62c 100644 --- a/Documentation/x86/exception-tables.rst +++ b/Documentation/x86/exception-tables.rst @@ -35,7 +35,7 @@ page fault handler:: void do_page_fault(struct pt_regs *regs, unsigned long error_code) in arch/x86/mm/fault.c. The parameters on the stack are set up by -the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter +the low level assembly glue in arch/x86/entry/entry_32.S. The parameter regs is a pointer to the saved registers on the stack, error_code contains a reason code for the exception. -- cgit From 2c9239c125f0c657d2955780f364168d4c15aa8b Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Fri, 10 May 2019 17:45:26 +0200 Subject: dt-bindings: pinctrl: Convert stm32 pinctrl bindings to json-schema Convert the STM32 pinctrl binding to DT schema format using json-schema. Signed-off-by: Alexandre Torgue Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- .../bindings/pinctrl/st,stm32-pinctrl.txt | 208 ---------------- .../bindings/pinctrl/st,stm32-pinctrl.yaml | 264 +++++++++++++++++++++ 2 files changed, 264 insertions(+), 208 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt create mode 100644 Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt deleted file mode 100644 index 00169255e48c..000000000000 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt +++ /dev/null @@ -1,208 +0,0 @@ -* STM32 GPIO and Pin Mux/Config controller - -STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware -controller. It controls the input/output settings on the available pins and -also provides ability to multiplex and configure the output of various on-chip -controllers onto these pads. - -Pin controller node: -Required properies: - - compatible: value should be one of the following: - "st,stm32f429-pinctrl" - "st,stm32f469-pinctrl" - "st,stm32f746-pinctrl" - "st,stm32f769-pinctrl" - "st,stm32h743-pinctrl" - "st,stm32mp157-pinctrl" - "st,stm32mp157-z-pinctrl" - - #address-cells: The value of this property must be 1 - - #size-cells : The value of this property must be 1 - - ranges : defines mapping between pin controller node (parent) to - gpio-bank node (children). - - pins-are-numbered: Specify the subnodes are using numbered pinmux to - specify pins. - -GPIO controller/bank node: -Required properties: - - gpio-controller : Indicates this device is a GPIO controller - - #gpio-cells : Should be two. - The first cell is the pin number - The second one is the polarity: - - 0 for active high - - 1 for active low - - reg : The gpio address range, relative to the pinctrl range - - clocks : clock that drives this bank - - st,bank-name : Should be a name string for this bank as specified in - the datasheet - -Optional properties: - - reset: : Reference to the reset controller - - st,syscfg: Should be phandle/offset/mask. - -The phandle to the syscon node which includes IRQ mux selection register. - -The offset of the IRQ mux selection register - -The field mask of IRQ mux, needed if different of 0xf. - - gpio-ranges: Define a dedicated mapping between a pin-controller and - a gpio controller. Format is <&phandle a b c> with: - -(phandle): phandle of pin-controller. - -(a): gpio base offset in range. - -(b): pin base offset in range. - -(c): gpio count in range - This entry has to be used either if there are holes inside a bank: - GPIOB0/B1/B2/B14/B15 (see example 2) - or if banks are not contiguous: - GPIOA/B/C/E... - NOTE: If "gpio-ranges" is used for a gpio controller, all gpio-controller - have to use a "gpio-ranges" entry. - More details in Documentation/devicetree/bindings/gpio/gpio.txt. - - st,bank-ioport: should correspond to the EXTI IOport selection (EXTI line - used to select GPIOs as interrupts). - - hwlocks: reference to a phandle of a hardware spinlock provider node. - - st,package: Indicates the SOC package used. - More details in include/dt-bindings/pinctrl/stm32-pinfunc.h - -Example 1: -#include -... - - pin-controller { - #address-cells = <1>; - #size-cells = <1>; - compatible = "st,stm32f429-pinctrl"; - ranges = <0 0x40020000 0x3000>; - pins-are-numbered; - - gpioa: gpio@40020000 { - gpio-controller; - #gpio-cells = <2>; - reg = <0x0 0x400>; - resets = <&reset_ahb1 0>; - st,bank-name = "GPIOA"; - }; - ... - pin-functions nodes follow... - }; - -Example 2: -#include -... - - pinctrl: pin-controller { - #address-cells = <1>; - #size-cells = <1>; - compatible = "st,stm32f429-pinctrl"; - ranges = <0 0x40020000 0x3000>; - pins-are-numbered; - - gpioa: gpio@40020000 { - gpio-controller; - #gpio-cells = <2>; - reg = <0x0 0x400>; - resets = <&reset_ahb1 0>; - st,bank-name = "GPIOA"; - gpio-ranges = <&pinctrl 0 0 16>; - }; - - gpiob: gpio@40020400 { - gpio-controller; - #gpio-cells = <2>; - reg = <0x0 0x400>; - resets = <&reset_ahb1 0>; - st,bank-name = "GPIOB"; - ngpios = 4; - gpio-ranges = <&pinctrl 0 16 3>, - <&pinctrl 14 30 2>; - }; - - - ... - pin-functions nodes follow... - }; - - -Contents of function subnode node: ----------------------------------- -Subnode format -A pinctrl node should contain at least one subnode representing the -pinctrl group available on the machine. Each subnode will list the -pins it needs, and how they should be configured, with regard to muxer -configuration, pullups, drive, output high/low and output speed. - - node { - pinmux = ; - GENERIC_PINCONFIG; - }; - -Required properties: -- pinmux: integer array, represents gpio pin number and mux setting. - Supported pin number and mux varies for different SoCs, and are defined in - dt-bindings/pinctrl/-pinfunc.h directly. - These defines are calculated as: - ((port * 16 + line) << 8) | function - With: - - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11) - - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15) - - function: The function number, can be: - * 0 : GPIO - * 1 : Alternate Function 0 - * 2 : Alternate Function 1 - * 3 : Alternate Function 2 - * ... - * 16 : Alternate Function 15 - * 17 : Analog - - To simplify the usage, macro is available to generate "pinmux" field. - This macro is available here: - - include/dt-bindings/pinctrl/stm32-pinfunc.h - - Some examples of using macro: - /* GPIO A9 set as alernate function 2 */ - ... { - pinmux = ; - }; - /* GPIO A9 set as GPIO */ - ... { - pinmux = ; - }; - /* GPIO A9 set as analog */ - ... { - pinmux = ; - }; - -Optional properties: -- GENERIC_PINCONFIG: is the generic pinconfig options to use. - Available options are: - - bias-disable, - - bias-pull-down, - - bias-pull-up, - - drive-push-pull, - - drive-open-drain, - - output-low - - output-high - - slew-rate = , with x being: - < 0 > : Low speed - < 1 > : Medium speed - < 2 > : Fast speed - < 3 > : High speed - -Example: - -pin-controller { -... - usart1_pins_a: usart1@0 { - pins1 { - pinmux = ; - bias-disable; - drive-push-pull; - slew-rate = <0>; - }; - pins2 { - pinmux = ; - bias-disable; - }; - }; -}; - -&usart1 { - pinctrl-0 = <&usart1_pins_a>; - pinctrl-names = "default"; -}; diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml new file mode 100644 index 000000000000..06c4b66c3ee6 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -0,0 +1,264 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# Copyright (C) STMicroelectronics 2019. +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/st,stm32-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: STM32 GPIO and Pin Mux/Config controller + +maintainers: + - Alexandre TORGUE + +description: | + STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware + controller. It controls the input/output settings on the available pins and + also provides ability to multiplex and configure the output of various + on-chip controllers onto these pads. + +properties: + compatible: + enum: + - st,stm32f429-pinctrl + - st,stm32f469-pinctrl + - st,stm32f746-pinctrl + - st,stm32f769-pinctrl + - st,stm32h743-pinctrl + - st,stm32mp157-pinctrl + - st,stm32mp157-z-pinctrl + + '#address-cells': + const: 1 + '#size-cells': + const: 1 + + ranges: true + pins-are-numbered: true + hwlocks: true + + st,syscfg: + $ref: "/schemas/types.yaml#/definitions/phandle-array" + description: Should be phandle/offset/mask + items: + - description: Phandle to the syscon node which includes IRQ mux selection. + - description: The offset of the IRQ mux selection register. + - description: The field mask of IRQ mux, needed if different of 0xf. + + st,package: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - enum: [1, 2, 4, 8] + description: + Indicates the SOC package used. + More details in include/dt-bindings/pinctrl/stm32-pinfunc.h + + +patternProperties: + '^gpio@[0-9a-f]*$': + properties: + gpio-controller: true + '#gpio-cells': + const: 2 + + reg: + maxItems: 1 + clocks: + maxItems: 1 + reset: + minItems: 1 + maxItems: 1 + gpio-ranges: + minItems: 1 + maxItems: 16 + ngpios: + description: + Number of available gpios in a bank. + minimum: 1 + maximum: 16 + + st,bank-name: + allOf: + - $ref: "/schemas/types.yaml#/definitions/string" + - enum: + - GPIOA + - GPIOB + - GPIOC + - GPIOD + - GPIOE + - GPIOF + - GPIOG + - GPIOH + - GPIOI + - GPIOJ + - GPIOK + - GPIOZ + description: + Should be a name string for this bank as specified in the datasheet. + + st,bank-ioport: + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32" + - minimum: 0 + - maximum: 11 + + description: + Should correspond to the EXTI IOport selection (EXTI line used + to select GPIOs as interrupts). + + required: + - gpio-controller + - '#gpio-cells' + - reg + - clocks + - st,bank-name + + '-[0-9]*$': + patternProperties: + '^pins': + description: | + A pinctrl node should contain at least one subnode representing the + pinctrl group available on the machine. Each subnode will list the + pins it needs, and how they should be configured, with regard to muxer + configuration, pullups, drive, output high/low and output speed. + properties: + pinmux: + allOf: + - $ref: "/schemas/types.yaml#/definitions/uint32-array" + description: | + Integer array, represents gpio pin number and mux setting. + Supported pin number and mux varies for different SoCs, and are + defined in dt-bindings/pinctrl/-pinfunc.h directly. + These defines are calculated as: ((port * 16 + line) << 8) | function + With: + - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11) + - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15) + - function: The function number, can be: + * 0 : GPIO + * 1 : Alternate Function 0 + * 2 : Alternate Function 1 + * 3 : Alternate Function 2 + * ... + * 16 : Alternate Function 15 + * 17 : Analog + To simplify the usage, macro is available to generate "pinmux" field. + This macro is available here: + - include/dt-bindings/pinctrl/stm32-pinfunc.h + Some examples of using macro: + /* GPIO A9 set as alernate function 2 */ + ... { + pinmux = ; + }; + /* GPIO A9 set as GPIO */ + ... { + pinmux = ; + }; + /* GPIO A9 set as analog */ + ... { + pinmux = ; + }; + + bias-disable: + type: boolean + bias-pull-down: + type: boolean + bias-pull-up: + type: boolean + drive-push-pull: + type: boolean + drive-open-drain: + type: boolean + output-low: + type: boolean + output-high: + type: boolean + slew-rate: + description: | + 0: Low speed + 1: Medium speed + 2: Fast speed + 3: High speed + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - enum: [0, 1, 2, 3] + + required: + - pinmux + +required: + - compatible + - '#address-cells' + - '#size-cells' + - ranges + - pins-are-numbered + +examples: + - | + #include + //Example 1 + pinctrl@40020000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "st,stm32f429-pinctrl"; + ranges = <0 0x40020000 0x3000>; + pins-are-numbered; + + gpioa: gpio@0 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x0 0x400>; + resets = <&reset_ahb1 0>; + st,bank-name = "GPIOA"; + }; + }; + + //Example 2 (using gpio-ranges) + pinctrl@50020000 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "st,stm32f429-pinctrl"; + ranges = <0 0x50020000 0x3000>; + pins-are-numbered; + + gpiob: gpio@1000 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x1000 0x400>; + resets = <&reset_ahb1 0>; + st,bank-name = "GPIOB"; + gpio-ranges = <&pinctrl 0 0 16>; + }; + + gpioc: gpio@2000 { + gpio-controller; + #gpio-cells = <2>; + reg = <0x2000 0x400>; + resets = <&reset_ahb1 0>; + st,bank-name = "GPIOC"; + ngpios = <5>; + gpio-ranges = <&pinctrl 0 16 3>, + <&pinctrl 14 30 2>; + }; + }; + + //Example 3 pin groups + pinctrl@60020000 { + usart1_pins_a: usart1-0 { + pins1 { + pinmux = ; + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; + bias-disable; + }; + }; + }; + + usart1 { + pinctrl-0 = <&usart1_pins_a>; + pinctrl-names = "default"; + }; + +... -- cgit From 1254db248fce4e2cf05a01c1c8df3b79745424c0 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 16 May 2019 17:13:38 +0200 Subject: dt-bindings: pinctrl: meson: add output support in pinconf add support for the pinconf DT property output-enable, output-disable, output-low and output-high in the meson pinctrl driver. Signed-off-by: Jerome Brunet Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt index a7618605bf1e..10dc4f7176ca 100644 --- a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt @@ -47,9 +47,15 @@ Required properties for pinmux nodes are: Required properties for configuration nodes: - pins: a list of pin names -Configuration nodes support the generic properties "bias-disable", -"bias-pull-up" and "bias-pull-down", described in file -pinctrl-bindings.txt +Configuration nodes support the following generic properties, as +described in file pinctrl-bindings.txt: + - "bias-disable" + - "bias-pull-up" + - "bias-pull-down" + - "output-enable" + - "output-disable" + - "output-low" + - "output-high" Optional properties : - drive-strength-microamp: Drive strength for the specified pins in uA. -- cgit From 13531e5d359e30d9e3d1cabd246a24cf6fdf084a Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 20 May 2019 14:00:57 +0530 Subject: dt-bindings: pinctrl: Modify pinctrl memory map Earlier, the PWM registers were included as part of the pinctrl memory map, but this turned to be useless as the muxing is being handled by the SoC pin controller itself. So, lets modify the pinctrl memory map to reflect the same. Fixes: 07b734fbdea2 ("dt-bindings: pinctrl: Add BM1880 pinctrl binding") Signed-off-by: Manivannan Sadhasivam Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt index ed34bb1ee81c..cc9a89aa4170 100644 --- a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt @@ -85,9 +85,9 @@ Required Properties: spi0 Example: - pinctrl: pinctrl@50 { + pinctrl: pinctrl@400 { compatible = "bitmain,bm1880-pinctrl"; - reg = <0x50 0x4B0>; + reg = <0x400 0x120>; pinctrl_uart0_default: uart0-default { pinmux { -- cgit From 752a74038dbd999c108f1c7474908f7501e246bb Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 20 May 2019 14:01:00 +0530 Subject: dt-bindings: pinctrl: Document pinconf bindings for BM1880 SoC Document pinconf bindings for Bitmain BM1880 SoC. Signed-off-by: Manivannan Sadhasivam Signed-off-by: Linus Walleij --- .../bindings/pinctrl/bitmain,bm1880-pinctrl.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt index cc9a89aa4170..4eb089bcb5f3 100644 --- a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt @@ -14,7 +14,8 @@ phrase "pin configuration node". The pin configuration nodes act as a container for an arbitrary number of subnodes. Each of these subnodes represents some desired configuration for a pin, a group, or a list of pins or groups. This configuration for BM1880 SoC -includes only pinmux as there is no pinconf support available in SoC. +includes pinmux and various pin configuration parameters, such as pull-up, +slew rate etc... Each configuration node can consist of multiple nodes describing the pinmux options. The name of each subnode is not important; all subnodes should be @@ -84,6 +85,22 @@ Required Properties: gpio66, gpio67, eth1, i2s0, i2s0_mclkin, i2s1, i2s1_mclkin, spi0 +Optional Properties: + +- bias-disable: No arguments. Disable pin bias. +- bias-pull-down: No arguments. The specified pins should be configured as + pull down. +- bias-pull-up: No arguments. The specified pins should be configured as + pull up. +- input-schmitt-enable: No arguments: Enable schmitt trigger for the specified + pins +- input-schmitt-disable: No arguments: Disable schmitt trigger for the specified + pins +- slew-rate: Integer. Sets slew rate for the specified pins. + Valid values are: + <0> - Slow + <1> - Fast + Example: pinctrl: pinctrl@400 { compatible = "bitmain,bm1880-pinctrl"; -- cgit From e0917169e5cc068a0c791726329746d1f4752b7a Mon Sep 17 00:00:00 2001 From: Clément Péron Date: Thu, 23 May 2019 17:10:47 +0200 Subject: dt-bindings: watchdog: add Allwinner H6 watchdog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allwinner H6 has a similar watchdog as the A64 which is already a compatible of the A31. This commit add the H6 compatible. Signed-off-by: Clément Péron Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt index 46055254e8dd..e65198d82a2b 100644 --- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt @@ -6,6 +6,7 @@ Required properties: "allwinner,sun4i-a10-wdt" "allwinner,sun6i-a31-wdt" "allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt" + "allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt" "allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt" - reg : Specifies base physical address and size of the registers. -- cgit From ef98682a4e1257ff45405bb1372939e8dfe774bb Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 11 May 2019 00:15:22 +0530 Subject: dt-bindings: reset: Add devicetree binding for BM1880 reset controller Add devicetree binding for Bitmain BM1880 SoC reset controller. Signed-off-by: Manivannan Sadhasivam Reviewed-by: Rob Herring Signed-off-by: Philipp Zabel --- .../devicetree/bindings/reset/bitmain,bm1880-reset.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt new file mode 100644 index 000000000000..a6f8455ae6c4 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt @@ -0,0 +1,18 @@ +Bitmain BM1880 SoC Reset Controller +=================================== + +Please also refer to reset.txt in this directory for common reset +controller binding usage. + +Required properties: +- compatible: Should be "bitmain,bm1880-reset" +- reg: Offset and length of reset controller space in SCTRL. +- #reset-cells: Must be 1. + +Example: + + rst: reset-controller@c00 { + compatible = "bitmain,bm1880-reset"; + reg = <0xc00 0x8>; + #reset-cells = <1>; + }; -- cgit From ad8288b89d4f02d184391457cf655e5f68943341 Mon Sep 17 00:00:00 2001 From: "Y.b. Lu" Date: Thu, 23 May 2019 02:33:37 +0000 Subject: dt-binding: ptp_qoriq: support ENETC PTP compatible Add a new compatible for ENETC PTP. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/ptp/ptp-qoriq.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt index 454c937076a2..6ec053449278 100644 --- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt +++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt @@ -4,6 +4,7 @@ General Properties: - compatible Should be "fsl,etsec-ptp" for eTSEC Should be "fsl,fman-ptp-timer" for DPAA FMan + Should be "fsl,enetc-ptp" for ENETC - reg Offset and length of the register set for the device - interrupts There should be at least two interrupts. Some devices have as many as four PTP related interrupts. -- cgit From e7c787cb2697ee4c101109bf21b4fc397cea150b Mon Sep 17 00:00:00 2001 From: Christian Mauderer Date: Mon, 13 May 2019 21:33:06 +0200 Subject: dt-bindings: leds: Add binding for spi-byte LED. This patch adds the binding documentation for a simple SPI based LED controller which use only one byte for setting the brightness. Signed-off-by: Christian Mauderer Reviewed-by: Rob Herring Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/leds-spi-byte.txt | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 Documentation/devicetree/bindings/leds/leds-spi-byte.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/leds/leds-spi-byte.txt b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt new file mode 100644 index 000000000000..28b6b2d9091e --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt @@ -0,0 +1,44 @@ +* Single Byte SPI LED Device Driver. + +The driver can be used for controllers with a very simple SPI protocol: +- one LED is controlled by a single byte on MOSI +- the value of the byte gives the brightness between two values (lowest to + highest) +- no return value is necessary (no MISO signal) + +The value for lowest and highest brightness is dependent on the device and +therefore on the compatible string. + +Depending on the compatible string some special functions (like hardware +accelerated blinking) might can be supported too. + +The driver currently only supports one LED. The properties of the LED are +configured in a sub-node in the device node. + +Required properties: +- compatible: should be one of + * "ubnt,acb-spi-led" microcontroller (SONiX 8F26E611LA) based device + used for example in Ubiquiti airCube ISP + +Property rules described in Documentation/devicetree/bindings/spi/spi-bus.txt +apply. + +LED sub-node properties: +- label: + see Documentation/devicetree/bindings/leds/common.txt +- default-state: + see Documentation/devicetree/bindings/leds/common.txt + Only "on" and "off" are supported. + +Example: + +led-controller@0 { + compatible = "ubnt,acb-spi-led"; + reg = <0>; + spi-max-frequency = <100000>; + + led { + label = "white:status"; + default-state = "on"; + }; +}; -- cgit From a5f6f88c3d1a453dd35cbaac2870f5fae866ad2e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 14:22:36 -0600 Subject: docs: Do not seek comments in kernel/rcu/tree_plugin.h There are no kerneldoc comments in this file, so do not attempt to include them in the docs build. Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 2 -- Documentation/driver-api/basics.rst | 3 --- 2 files changed, 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index a29c99d13331..a53ec2eb8176 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -358,8 +358,6 @@ Read-Copy Update (RCU) .. kernel-doc:: kernel/rcu/tree.c -.. kernel-doc:: kernel/rcu/tree_plugin.h - .. kernel-doc:: kernel/rcu/tree_exp.h .. kernel-doc:: kernel/rcu/update.c diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index e970fadf4d1a..1ba88c7b3984 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -115,9 +115,6 @@ Kernel utility functions .. kernel-doc:: kernel/rcu/tree.c :export: -.. kernel-doc:: kernel/rcu/tree_plugin.h - :export: - .. kernel-doc:: kernel/rcu/update.c :export: -- cgit From e8d4f892bb245702ee23abfcd28eb98b5eca6c86 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 14:31:50 -0600 Subject: docs: Fix a misdirected kerneldoc directive The stratix10 service layer documentation tried to include a kerneldoc comments for a nonexistent struct; leading to a "no structured comments found" message. Switch it to stratix10_svc_command_config_type, which appears at that spot in the sequence and was not included. Signed-off-by: Jonathan Corbet --- Documentation/driver-api/firmware/other_interfaces.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/firmware/other_interfaces.rst b/Documentation/driver-api/firmware/other_interfaces.rst index a4ac54b5fd79..b81794e0cfbb 100644 --- a/Documentation/driver-api/firmware/other_interfaces.rst +++ b/Documentation/driver-api/firmware/other_interfaces.rst @@ -33,7 +33,7 @@ of the requests on to a secure monitor (EL3). :functions: stratix10_svc_client_msg .. kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h - :functions: stratix10_svc_command_reconfig_payload + :functions: stratix10_svc_command_config_type .. kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h :functions: stratix10_svc_cb_data -- cgit From 79b647a0c0d52a03feb8b39a6311b01bb2bf87a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 May 2019 20:07:56 +0200 Subject: dt-bindings: net: document new usxgmii phy mode Add new interface mode USXGMII to binding documentation. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index e88c3641d613..5475682bf06e 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -43,6 +43,7 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. * "rxaui" * "xaui" * "10gbase-kr" (10GBASE-KR, XFI, SFI) + * "usxgmii" - phy-connection-type: the same as "phy-mode" property but described in the Devicetree Specification; - phy-handle: phandle, specifies a reference to a node representing a PHY -- cgit From 1372bbe66a59528e65d10ae67416835d3ec2452e Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 6 May 2019 14:16:12 -0500 Subject: dt-bindings: ti-lmu: Modify dt bindings for the LM3697 The LM3697 is a single function LED driver. The single function LED driver needs to reside in the LED directory as a dedicated LED driver and not as a MFD device. The device does have common brightness and ramp features and those can be accomodated by a TI LMU framework. The LM3697 dt binding needs to be moved from the ti-lmu.txt and a dedicated LED dt binding needs to be added. The new LM3697 LED dt binding will then reside in the Documentation/devicetree/bindings/leds directory and follow the current LED and general bindings guidelines. Signed-off-by: Dan Murphy Reviewed-by: Rob Herring Acked-by: Lee Jones Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/leds-lm3697.txt | 73 ++++++++++++++++++++++ Documentation/devicetree/bindings/mfd/ti-lmu.txt | 27 +------- 2 files changed, 74 insertions(+), 26 deletions(-) create mode 100644 Documentation/devicetree/bindings/leds/leds-lm3697.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/leds/leds-lm3697.txt b/Documentation/devicetree/bindings/leds/leds-lm3697.txt new file mode 100644 index 000000000000..63992d732959 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-lm3697.txt @@ -0,0 +1,73 @@ +* Texas Instruments - LM3697 Highly Efficient White LED Driver + +The LM3697 11-bit LED driver provides high- +performance backlight dimming for 1, 2, or 3 series +LED strings while delivering up to 90% efficiency. + +This device is suitable for display and keypad lighting + +Required properties: + - compatible: + "ti,lm3697" + - reg : I2C slave address + - #address-cells : 1 + - #size-cells : 0 + +Optional properties: + - enable-gpios : GPIO pin to enable/disable the device + - vled-supply : LED supply + +Required child properties: + - reg : 0 - LED is Controlled by bank A + 1 - LED is Controlled by bank B + - led-sources : Indicates which HVLED string is associated to which + control bank. This is a zero based property so + HVLED1 = 0, HVLED2 = 1, HVLED3 = 2. + Additional information is contained + in Documentation/devicetree/bindings/leds/common.txt + +Optional child properties: + - ti,brightness-resolution - see Documentation/devicetree/bindings/mfd/ti-lmu.txt + - ramp-up-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt + - ramp-down-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt + - label : see Documentation/devicetree/bindings/leds/common.txt + - linux,default-trigger : + see Documentation/devicetree/bindings/leds/common.txt + +Example: + +HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is +controlled by control bank B. + +led-controller@36 { + compatible = "ti,lm3697"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x36>; + + enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>; + vled-supply = <&vbatt>; + + led@0 { + reg = <0>; + led-sources = <0 2>; + ti,brightness-resolution = <2047>; + ramp-up-us = <5000>; + ramp-down-us = <1000>; + label = "white:first_backlight_cluster"; + linux,default-trigger = "backlight"; + }; + + led@1 { + reg = <1>; + led-sources = <1>; + ti,brightness-resolution = <255>; + ramp-up-us = <500>; + ramp-down-us = <1000>; + label = "white:second_backlight_cluster"; + linux,default-trigger = "backlight"; + }; +} + +For more product information please see the link below: +http://www.ti.com/lit/ds/symlink/lm3697.pdf diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt index e3efefb194c5..782d3c9812ed 100644 --- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt +++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt @@ -8,7 +8,6 @@ TI LMU driver supports lighting devices below. LM3632 Backlight and regulator LM3633 Backlight, LED and fault monitor LM3695 Backlight - LM3697 Backlight and fault monitor Required properties: - compatible: Should be one of: @@ -16,11 +15,10 @@ Required properties: "ti,lm3632" "ti,lm3633" "ti,lm3695" - "ti,lm3697" - reg: I2C slave address. 0x11 for LM3632 0x29 for LM3631 - 0x36 for LM3633, LM3697 + 0x36 for LM3633 0x63 for LM3695 Optional properties: @@ -51,7 +49,6 @@ Optional nodes: Required properties: - compatible: Should be one of: "ti,lm3633-fault-monitor" - "ti,lm3697-fault-monitor" - leds: LED properties for LM3633. Please refer to [2]. - regulators: Regulator properties for LM3631 and LM3632. Please refer to [3]. @@ -216,25 +213,3 @@ lm3695@63 { }; }; }; - -lm3697@36 { - compatible = "ti,lm3697"; - reg = <0x36>; - - enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; - - backlight { - compatible = "ti,lm3697-backlight"; - - lcd { - ti,brightness-resolution = <255>; - led-sources = <0 1 2>; - ramp-up-us = <200000>; - ramp-down-us = <200000>; - }; - }; - - fault-monitor { - compatible = "ti,lm3697-fault-monitor"; - }; -}; -- cgit From 41ce14e39bbe0683a2d49385ee8a8cb0b1d010eb Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 14:43:42 -0600 Subject: docs: Do not seek kerneldoc comments in hw-consumer.h There are no kerneldoc comments here, so looking for them just yields a warning in the docs build. Signed-off-by: Jonathan Corbet --- Documentation/driver-api/iio/hw-consumer.rst | 1 - 1 file changed, 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/iio/hw-consumer.rst b/Documentation/driver-api/iio/hw-consumer.rst index e0fe0b98230e..819fb9edc005 100644 --- a/Documentation/driver-api/iio/hw-consumer.rst +++ b/Documentation/driver-api/iio/hw-consumer.rst @@ -45,7 +45,6 @@ A typical IIO HW consumer setup looks like this:: More details ============ -.. kernel-doc:: include/linux/iio/hw-consumer.h .. kernel-doc:: drivers/iio/buffer/industrialio-hw-consumer.c :export: -- cgit From 2496f17772f757ba7a58c2abc1cdfdaf7cca29fb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 23 May 2019 10:14:15 +0200 Subject: dt-bindings: Add vendor prefix for Espressif Add Espressif Systems DT vendor prefix. That prefix has been used for quite some time for WiFi chips, but has never been documented. Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 33a65a45e319..c4d4f6714814 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -277,6 +277,8 @@ patternProperties: description: Ecole Polytechnique Fédérale de Lausanne "^epson,.*": description: Seiko Epson Corp. + "^esp,.*": + description: Espressif Systems Co. Ltd. "^est,.*": description: ESTeem Wireless Modems "^ettus,.*": -- cgit From 3aef4472665695be7cbdd2cc274814f56d36e4ef Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 15:01:30 -0600 Subject: docs: No structured comments in target_core_device.c Documentation/driver-api/target.rst is seeking kerneldoc comments in drivers/target/target_core_device.c, but no such comments exist. Take out the kernel-doc directive and eliminate one warning from the build. Signed-off-by: Jonathan Corbet --- Documentation/driver-api/target.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/target.rst b/Documentation/driver-api/target.rst index 4363611dd86d..620ec6173a93 100644 --- a/Documentation/driver-api/target.rst +++ b/Documentation/driver-api/target.rst @@ -10,8 +10,8 @@ TBD Target core device interfaces ============================= -.. kernel-doc:: drivers/target/target_core_device.c - :export: +This section is blank because no kerneldoc comments have been added to +drivers/target/target_core_device.c. Target core transport interfaces ================================ -- cgit From dea20be5063c97bdac48e81ee2a85975f14885ed Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 15:03:39 -0600 Subject: docs: no structured comments in fs/file_table.c Remove the kernel-doc directive, since there are only warnings to be found there. Signed-off-by: Jonathan Corbet --- Documentation/filesystems/api-summary.rst | 3 --- 1 file changed, 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst index aa51ffcfa029..bbb0c1c0e5cf 100644 --- a/Documentation/filesystems/api-summary.rst +++ b/Documentation/filesystems/api-summary.rst @@ -89,9 +89,6 @@ Other Functions .. kernel-doc:: fs/direct-io.c :export: -.. kernel-doc:: fs/file_table.c - :export: - .. kernel-doc:: fs/libfs.c :export: -- cgit From 3f715b147a6c5245ee25d7334f4053c339feef98 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 24 May 2019 15:05:41 -0600 Subject: docs: No structured comments in include/linux/interconnect.h Remove the kernel-doc directive for this file, since there's nothing there and it generates a warning. Signed-off-by: Jonathan Corbet --- Documentation/interconnect/interconnect.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/interconnect/interconnect.rst index b8107dcc4cd3..c3e004893796 100644 --- a/Documentation/interconnect/interconnect.rst +++ b/Documentation/interconnect/interconnect.rst @@ -89,6 +89,5 @@ Interconnect consumers Interconnect consumers are the clients which use the interconnect APIs to get paths between endpoints and set their bandwidth/latency/QoS requirements -for these interconnect paths. - -.. kernel-doc:: include/linux/interconnect.h +for these interconnect paths. These interfaces are not currently +documented. -- cgit From 253a41c6fbadd1305e25f84ffc455f2b4d57439c Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 14 May 2019 13:40:51 -0700 Subject: dt-bindings: Remove Linuxisms from common-properties binding We shouldn't reference Linux kernel functions or Linux itself in proper bindings. It's OK to reference functions in the kernel when explaining examples, but otherwise we shouldn't reference functions to describe what the binding means. Cc: Hsin-Yi Wang Signed-off-by: Stephen Boyd Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/common-properties.txt | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/common-properties.txt b/Documentation/devicetree/bindings/common-properties.txt index a3448bfa1c82..98a28130e100 100644 --- a/Documentation/devicetree/bindings/common-properties.txt +++ b/Documentation/devicetree/bindings/common-properties.txt @@ -5,30 +5,29 @@ Endianness ---------- The Devicetree Specification does not define any properties related to hardware -byteswapping, but endianness issues show up frequently in porting Linux to +byte swapping, but endianness issues show up frequently in porting drivers to different machine types. This document attempts to provide a consistent -way of handling byteswapping across drivers. +way of handling byte swapping across drivers. Optional properties: - big-endian: Boolean; force big endian register accesses unconditionally (e.g. ioread32be/iowrite32be). Use this if you - know the peripheral always needs to be accessed in BE mode. + know the peripheral always needs to be accessed in big endian (BE) mode. - little-endian: Boolean; force little endian register accesses unconditionally (e.g. readl/writel). Use this if you know the - peripheral always needs to be accessed in LE mode. + peripheral always needs to be accessed in little endian (LE) mode. - native-endian: Boolean; always use register accesses matched to the endianness of the kernel binary (e.g. LE vmlinux -> readl/writel, - BE vmlinux -> ioread32be/iowrite32be). In this case no byteswaps + BE vmlinux -> ioread32be/iowrite32be). In this case no byte swaps will ever be performed. Use this if the hardware "self-adjusts" register endianness based on the CPU's configured endianness. If a binding supports these properties, then the binding should also specify the default behavior if none of these properties are present. In such cases, little-endian is the preferred default, but it is not -a requirement. The of_device_is_big_endian() and of_fdt_is_big_endian() -helper functions do assume that little-endian is the default, because -most existing (PCI-based) drivers implicitly default to LE by using -readl/writel for MMIO accesses. +a requirement. Some implementations assume that little-endian is +the default, because most existing (PCI-based) drivers implicitly +default to LE for their MMIO accesses. Examples: Scenario 1 : CPU in LE mode & device in LE mode. -- cgit From 48d07c04b4cc1dc1221965312f58fd84926212fe Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2019 22:13:33 +0100 Subject: rcu: Enable elimination of Tree-RCU softirq processing Some workloads need to change kthread priority for RCU core processing without affecting other softirq work. This commit therefore introduces the rcutree.use_softirq kernel boot parameter, which moves the RCU core work from softirq to a per-CPU SCHED_OTHER kthread named rcuc. Use of SCHED_OTHER approach avoids the scalability problems that appeared with the earlier attempt to move RCU core processing to from softirq to kthreads. That said, kernels built with RCU_BOOST=y will run the rcuc kthreads at the RCU-boosting priority. Note that rcutree.use_softirq=0 must be specified to move RCU core processing to the rcuc kthreads: rcutree.use_softirq=1 is the default. Reported-by: Thomas Gleixner Tested-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Adjust for invoke_rcu_callbacks() only ever being invoked from RCU core processing, in contrast to softirq->rcuc transition in old mainline RCU priority boosting. ] [ paulmck: Avoid wakeups when scheduler might have invoked rcu_read_unlock() while holding rq or pi locks, also possibly fixing a pre-existing latent bug involving raise_softirq()-induced wakeups. ] Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b96fd15c7316 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3752,6 +3752,12 @@ the propagation of recent CPU-hotplug changes up the rcu_node combining tree. + rcutree.use_softirq= [KNL] + If set to zero, move all RCU_SOFTIRQ processing to + per-CPU rcuc kthreads. Defaults to a non-zero + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining tree. This is used by rcutorture, and might -- cgit From 88903c464321cdbc2d473c24cbf311f576cf05bc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:30 +0900 Subject: tracing/probe: Add ustring type for user-space string Add "ustring" type for fetching user-space string from kprobe event. User can specify ustring type at uprobe event, and it is same as "string" for uprobe. Note that probe-event provides this option but it doesn't choose the correct type automatically since we have not way to decide the address is in user-space or not on some arch (and on some other arch, you can fetch the string by "string" type). So user must carefully check the target code (e.g. if you see __user on the target variable) and use this new type. Link: http://lkml.kernel.org/r/155789871009.26965.14167558859557329331.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 235ce2ab131a..a3ac7c9ac242 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -55,7 +55,8 @@ Synopsis of kprobe_events NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types - (x8/x16/x32/x64), "string" and bitfield are supported. + (x8/x16/x32/x64), "string", "ustring" and bitfield + are supported. (\*1) only for the probe on function entry (offs == 0). (\*2) only for return probe. @@ -77,7 +78,11 @@ apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is wrong, but '+8($stack):x8[8]' is OK.) String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container -has been paged out. +has been paged out. "ustring" type is an alternative of string for user-space. +Note that kprobe-event provides string/ustring types, but doesn't change it +automatically. So user has to decide if the targe string in kernel or in user +space carefully. On some arch, if you choose wrong one, it always fails to +record string data. The string array type is a bit different from other types. For other base types, [1] is equal to (e.g. +0(%di):x32[1] is same as +0(%di):x32.) But string[1] is not equal to string. The string type itself -- cgit From e65f7ae7f4da56622ecf8f1eaed333b9a13f9435 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:42 +0900 Subject: tracing/probe: Support user-space dereference Support user-space dereference syntax for probe event arguments to dereference the data-structure or array in user-space. The syntax is just adding 'u' before an offset value. +|-u() e.g. +u8(%ax), +u0(+0(%si)) For example, if you probe do_sched_setscheduler(pid, policy, param) and record param->sched_priority, you can add new probe as below; p do_sched_setscheduler priority=+u0($arg3) Note that kprobe event provides this and it doesn't change the dereference method automatically because we do not know whether the given address is in userspace or kernel on some archs. So as same as "ustring", this is an option for user, who has to carefully choose the dereference method. Link: http://lkml.kernel.org/r/155789872187.26965.4468456816590888687.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 27 ++++++++++++++++++++++----- Documentation/trace/uprobetracer.rst | 10 ++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index a3ac7c9ac242..09ff474493e1 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -51,7 +51,7 @@ Synopsis of kprobe_events $argN : Fetch the Nth function argument. (N >= 1) (\*1) $retval : Fetch return value.(\*2) $comm : Fetch current task comm. - +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(\*3) + +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types @@ -61,6 +61,7 @@ Synopsis of kprobe_events (\*1) only for the probe on function entry (offs == 0). (\*2) only for return probe. (\*3) this is useful for fetching a field of data structures. + (\*4) "u" means user-space dereference. See :ref:`user_mem_access`. Types ----- @@ -79,10 +80,7 @@ wrong, but '+8($stack):x8[8]' is OK.) String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. "ustring" type is an alternative of string for user-space. -Note that kprobe-event provides string/ustring types, but doesn't change it -automatically. So user has to decide if the targe string in kernel or in user -space carefully. On some arch, if you choose wrong one, it always fails to -record string data. +See :ref:`user_mem_access` for more info.. The string array type is a bit different from other types. For other base types, [1] is equal to (e.g. +0(%di):x32[1] is same as +0(%di):x32.) But string[1] is not equal to string. The string type itself @@ -97,6 +95,25 @@ Symbol type('symbol') is an alias of u32 or u64 type (depends on BITS_PER_LONG) which shows given pointer in "symbol+offset" style. For $comm, the default type is "string"; any other type is invalid. +.. _user_mem_access: +User Memory Access +------------------ +Kprobe events supports user-space memory access. For that purpose, you can use +either user-space dereference syntax or 'ustring' type. + +The user-space dereference syntax allows you to access a field of a data +structure in user-space. This is done by adding the "u" prefix to the +dereference syntax. For example, +u4(%si) means it will read memory from the +address in the register %si offset by 4, and the memory is expected to be in +user-space. You can use this for strings too, e.g. +u0(%si):string will read +a string from the address in the register %si that is expected to be in user- +space. 'ustring' is a shortcut way of performing the same task. That is, ++0(%si):ustring is equivalent to +u0(%si):string. + +Note that kprobe-event provides the user-memory access syntax but it doesn't +use it transparently. This means if you use normal dereference or string type +for user memory, it might fail, and may always fail on some archs. The user +has to carefully check if the target data is in kernel or user space. Per-Probe Event Filtering ------------------------- diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index 4346e23e3ae7..ab13319c66ac 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -42,16 +42,18 @@ Synopsis of uprobe_tracer @+OFFSET : Fetch memory at OFFSET (OFFSET from same file as PATH) $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. - $retval : Fetch return value.(*) + $retval : Fetch return value.(\*1) $comm : Fetch current task comm. - +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) + +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*2)(\*3) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types (x8/x16/x32/x64), "string" and bitfield are supported. - (*) only for return probe. - (**) this is useful for fetching a field of data structures. + (\*1) only for return probe. + (\*2) this is useful for fetching a field of data structures. + (\*3) Unlike kprobe event, "u" prefix will just be ignored, becuse uprobe + events can access only user-space memory. Types ----- -- cgit From 970988e19eb0a0dc24fe14bf91972019e48336e2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:32:35 +0900 Subject: tracing/kprobe: Add kprobe_event= boot parameter Add kprobe_event= boot parameter to define kprobe events at boot time. The definition syntax is similar to tracefs/kprobe_events interface, but use ',' and ';' instead of ' ' and '\n' respectively. e.g. kprobe_event=p,vfs_read,$arg1,$arg2 This puts a probe on vfs_read with argument1 and 2, and enable the new event. Link: http://lkml.kernel.org/r/155851395498.15728.830529496248543583.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/admin-guide/kernel-parameters.txt | 13 +++++++++++++ Documentation/trace/kprobetrace.rst | 14 ++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..11b9ffb265eb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2007,6 +2007,19 @@ Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the default is off. + kprobe_event=[probe-list] + [FTRACE] Add kprobe events and enable at boot time. + The probe-list is a semicolon delimited list of probe + definitions. Each definition is same as kprobe_events + interface, but the parameters are comma delimited. + For example, to add a kprobe event on vfs_read with + arg1 and arg2, add to the command line; + + kprobe_event=p,vfs_read,$arg1,$arg2 + + See also Documentation/trace/kprobetrace.rst "Kernel + Boot Parameter" section. + kpti= [ARM64] Control page table isolation of user and kernel address spaces. Default: enabled on cores which need mitigation. diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 09ff474493e1..af776989caca 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -146,6 +146,20 @@ You can check the total number of probe hits and probe miss-hits via The first column is event name, the second is the number of probe hits, the third is the number of probe miss-hits. +Kernel Boot Parameter +--------------------- +You can add and enable new kprobe events when booting up the kernel by +"kprobe_event=" parameter. The parameter accepts a semicolon-delimited +kprobe events, which format is similar to the kprobe_events. +The difference is that the probe definition parameters are comma-delimited +instead of space. For example, adding myprobe event on do_sys_open like below + + p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack) + +should be below for kernel boot parameter (just replace spaces with comma) + + p:myprobe,do_sys_open,dfd=%ax,filename=%dx,flags=%cx,mode=+4($stack) + Usage examples -------------- -- cgit From 8e2c67f9960d32edcd16c846c947b2a05dad32ad Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 25 May 2019 15:41:36 +0200 Subject: dt-bindings: vendor: Escape single quote Single quotes need to be escaped in YAML, make sure it's the case. Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index c4d4f6714814..3bb61e4b4597 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -668,7 +668,7 @@ patternProperties: "^plantower,.*": description: Plantower Co., Ltd "^plathome,.*": - description: Plat'Home Co., Ltd. + description: Plat\'Home Co., Ltd. "^plda,.*": description: PLDA "^plx,.*": -- cgit From b361797f3d0b5accf23af848a089589fb8ce5f80 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 25 May 2019 15:41:37 +0200 Subject: dt-bindings: vendor: Fix simtek vendor compatible In the text file, simtek didn't have any description and apparently this confused the conversion script. Fix the simtek entry and add a proper description. Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 3bb61e4b4597..653dee9d7dd0 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -783,8 +783,8 @@ patternProperties: description: Silergy Corp. "^siliconmitus,.*": description: Silicon Mitus, Inc. - "^simte,.*": - description: k + "^simtek,.*": + description: Cypress Semiconductor Corporation (Simtek Corporation) "^sirf,.*": description: SiRF Technology, Inc. "^sis,.*": -- cgit From 462409365b698a62a93ddf834b6221e28b76652a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 May 2019 13:12:22 +0200 Subject: dt-bindings: net: add qca,ar71xx.txt documentation Add binding documentation for Atheros/QCA networking IP core used in many routers. Signed-off-by: Oleksij Rempel Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- .../devicetree/bindings/net/qca,ar71xx.txt | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/qca,ar71xx.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.txt b/Documentation/devicetree/bindings/net/qca,ar71xx.txt new file mode 100644 index 000000000000..2a33e71ba72b --- /dev/null +++ b/Documentation/devicetree/bindings/net/qca,ar71xx.txt @@ -0,0 +1,45 @@ +Required properties: +- compatible: Should be "qca,-eth". Currently support compatibles are: + qca,ar7100-eth - Atheros AR7100 + qca,ar7240-eth - Atheros AR7240 + qca,ar7241-eth - Atheros AR7241 + qca,ar7242-eth - Atheros AR7242 + qca,ar9130-eth - Atheros AR9130 + qca,ar9330-eth - Atheros AR9330 + qca,ar9340-eth - Atheros AR9340 + qca,qca9530-eth - Qualcomm Atheros QCA9530 + qca,qca9550-eth - Qualcomm Atheros QCA9550 + qca,qca9560-eth - Qualcomm Atheros QCA9560 + +- reg : Address and length of the register set for the device +- interrupts : Should contain eth interrupt +- phy-mode : See ethernet.txt file in the same directory +- clocks: the clock used by the core +- clock-names: the names of the clock listed in the clocks property. These are + "eth" and "mdio". +- resets: Should contain phandles to the reset signals +- reset-names: Should contain the names of reset signal listed in the resets + property. These are "mac" and "mdio" + +Optional properties: +- phy-handle : phandle to the PHY device connected to this device. +- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. + Use instead of phy-handle. + +Optional subnodes: +- mdio : specifies the mdio bus, used as a container for phy nodes + according to phy.txt in the same directory + +Example: + +ethernet@1a000000 { + compatible = "qca,ar9330-eth"; + reg = <0x1a000000 0x200>; + interrupts = <5>; + resets = <&rst 13>, <&rst 23>; + reset-names = "mac", "mdio"; + clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_MDIO>; + clock-names = "eth", "mdio"; + + phy-mode = "gmii"; +}; -- cgit From e618795367df99c463f5e4e0ddee9612fcce6b06 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 May 2019 21:42:28 +0100 Subject: dt-bindings: pinctrl: fix spelling mistakes in pinctl documentation The spelling of configured is incorrect in the documentation. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Bjorn Andersson Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt | 6 +++--- Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt | 6 +++--- 11 files changed, 33 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt index 68e93d5b7ede..c9782397ff14 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt @@ -122,17 +122,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt index 6dd72f8599e9..7b151894f5a0 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt @@ -118,17 +118,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt index 86ecdcfc4fb8..d46973968873 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt @@ -97,17 +97,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt index 195a7a0ef0cc..3354a63296d9 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt @@ -130,17 +130,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt index 5034eb6653c7..a7dd213c77c6 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt @@ -124,17 +124,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt index f15443f6e78e..da52df6273bc 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt @@ -128,17 +128,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt index fa97f609fe45..a56cb882830c 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt @@ -149,17 +149,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt index e70c79bbbc5b..00174f08ba1d 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt @@ -135,17 +135,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt index 2b8f77762edc..a50e74684195 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt @@ -150,17 +150,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt index 769ca83bb40d..be034d329e10 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt @@ -142,17 +142,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt index 665aadb5ea28..321bdb9be0d2 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt @@ -118,17 +118,17 @@ to specify in a pin configuration subnode: - bias-disable: Usage: optional Value type: - Definition: The specified pins should be configued as no pull. + Definition: The specified pins should be configured as no pull. - bias-pull-down: Usage: optional Value type: - Definition: The specified pins should be configued as pull down. + Definition: The specified pins should be configured as pull down. - bias-pull-up: Usage: optional Value type: - Definition: The specified pins should be configued as pull up. + Definition: The specified pins should be configured as pull up. - output-high: Usage: optional -- cgit From 303b3cf95ed322ab781cafe4c279e58755d3d8ee Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 27 May 2019 18:41:57 -0700 Subject: dt-bindings: add more optional properties for elan_i2c touchpads Some new touchpads IC are connected through PS/2 and I2C. On some of these new IC, the I2C part doesn't have all of the information available. We need to be able to forward the touchpad parameters from PS/2 and thus, we need those new optional properties. Signed-off-by: Benjamin Tissoires Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/elan_i2c.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/input/elan_i2c.txt b/Documentation/devicetree/bindings/input/elan_i2c.txt index 797607460735..9963247706f2 100644 --- a/Documentation/devicetree/bindings/input/elan_i2c.txt +++ b/Documentation/devicetree/bindings/input/elan_i2c.txt @@ -13,9 +13,20 @@ Optional properties: pinctrl binding [1]). - vcc-supply: a phandle for the regulator supplying 3.3V power. - elan,trackpoint: touchpad can support a trackpoint (boolean) +- elan,clickpad: touchpad is a clickpad (the entire surface is a button) +- elan,middle-button: touchpad has a physical middle button +- elan,x_traces: number of antennas on the x axis +- elan,y_traces: number of antennas on the y axis +- some generic touchscreen properties [2]: + * touchscreen-size-x + * touchscreen-size-y + * touchscreen-x-mm + * touchscreen-y-mm + [0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt [1]: Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt +[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt Example: &i2c1 { -- cgit From 5e343fbb7176a082dec8632f72999aba82f0e720 Mon Sep 17 00:00:00 2001 From: Przemyslaw Gaj Date: Tue, 16 Apr 2019 09:36:15 +0100 Subject: dt-bindings: i3c: Document dropped support for I2C 10 bit devices Because this patch series dropped support for 10 bit I2C devices, I'm documenting this. Signed-off-by: Przemyslaw Gaj Signed-off-by: Boris Brezillon --- Documentation/devicetree/bindings/i3c/i3c.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/i3c/i3c.txt b/Documentation/devicetree/bindings/i3c/i3c.txt index ab729a0a86ae..4ffe059f0fec 100644 --- a/Documentation/devicetree/bindings/i3c/i3c.txt +++ b/Documentation/devicetree/bindings/i3c/i3c.txt @@ -39,7 +39,9 @@ valid here, but several new properties have been added. New constraint on existing properties: -------------------------------------- - reg: contains 3 cells - + first cell : still encoding the I2C address + + first cell : still encoding the I2C address. 10 bit addressing is not + supported. Devices with 10 bit address can't be properly passed through + DEFSLVS command. + second cell: shall be 0 -- cgit From 4b3d50062ce06530adc27224a1e10f087d0c9caf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 23 May 2019 10:17:36 +0200 Subject: gpio: Fix minor grammar errors in documentation This fixes up some of my own mistakes when I stressed to refresh the documentation. Signed-off-by: Linus Walleij --- Documentation/driver-api/gpio/driver.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 1ce7fcd0f989..58036c2d84d2 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -235,7 +235,7 @@ means that a pull up or pull-down resistor is available on the output of the GPIO line, and this resistor is software controlled. In discrete designs, a pull-up or pull-down resistor is simply soldered on -the circuit board. This is not something we deal or model in software. The +the circuit board. This is not something we deal with or model in software. The most you will think about these lines is that they will very likely be configured as open drain or open source (see the section above). @@ -292,18 +292,18 @@ We can divide GPIO irqchips in two broad categories: - HIERARCHICAL INTERRUPT CHIPS: this means that each GPIO line has a dedicated irq line to a parent interrupt controller one level up. There is no need - to inquire the GPIO hardware to figure out which line has figured, but it - may still be necessary to acknowledge the interrupt and set up the - configuration such as edge sensitivity. + to inquire the GPIO hardware to figure out which line has fired, but it + may still be necessary to acknowledge the interrupt and set up configuration + such as edge sensitivity. Realtime considerations: a realtime compliant GPIO driver should not use spinlock_t or any sleepable APIs (like PM runtime) as part of its irqchip implementation. -- spinlock_t should be replaced with raw_spinlock_t [1]. +- spinlock_t should be replaced with raw_spinlock_t.[1] - If sleepable APIs have to be used, these can be done from the .irq_bus_lock() and .irq_bus_unlock() callbacks, as these are the only slowpath callbacks - on an irqchip. Create the callbacks if needed [2]. + on an irqchip. Create the callbacks if needed.[2] Cascaded GPIO irqchips @@ -361,7 +361,7 @@ Cascaded GPIO irqchips usually fall in one of three categories: Realtime considerations: this kind of handlers will be forced threaded on -RT, and as result the IRQ core will complain that generic_handle_irq() is called - with IRQ enabled and the same work around as for "CHAINED GPIO irqchips" can + with IRQ enabled and the same work-around as for "CHAINED GPIO irqchips" can be applied. - NESTED THREADED GPIO IRQCHIPS: these are off-chip GPIO expanders and any -- cgit From 919c46c89bff432f45994259d28d774212cf6e8d Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Fri, 10 May 2019 11:03:39 +0200 Subject: Documentation: gpio: remove duplicated lines The 'default (active high)' lines are repeated twice. Avoid people stare at their screens looking for differences. Signed-off-by: Luca Ceresoli Reviewed-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- Documentation/driver-api/gpio/consumer.rst | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index 5e4d8aa68913..23d68c321c5c 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -283,8 +283,6 @@ To summarize:: gpiod_set_value(desc, 1); default (active high) high gpiod_set_value(desc, 0); active low high gpiod_set_value(desc, 1); active low low - gpiod_set_value(desc, 0); default (active high) low - gpiod_set_value(desc, 1); default (active high) high gpiod_set_value(desc, 0); open drain low gpiod_set_value(desc, 1); open drain high impedance gpiod_set_value(desc, 0); open source high impedance -- cgit From 910f38bed9439e765f7e240edfe32b5f04ad92f5 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 13 May 2019 19:50:33 -0500 Subject: dt-bindings: gpio: Convert Arm PL061 to json-schema Convert the Arm PL061 GPIO controller binding to json-schema format. As I'm the author for all but the gpio-ranges line, make the schema dual GPL/BSD license. Cc: Bartosz Golaszewski Cc: linux-gpio@vger.kernel.org Signed-off-by: Rob Herring Signed-off-by: Linus Walleij --- .../devicetree/bindings/gpio/pl061-gpio.txt | 10 ---- .../devicetree/bindings/gpio/pl061-gpio.yaml | 69 ++++++++++++++++++++++ 2 files changed, 69 insertions(+), 10 deletions(-) delete mode 100644 Documentation/devicetree/bindings/gpio/pl061-gpio.txt create mode 100644 Documentation/devicetree/bindings/gpio/pl061-gpio.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt b/Documentation/devicetree/bindings/gpio/pl061-gpio.txt deleted file mode 100644 index 89058d375b7c..000000000000 --- a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt +++ /dev/null @@ -1,10 +0,0 @@ -ARM PL061 GPIO controller - -Required properties: -- compatible : "arm,pl061", "arm,primecell" -- #gpio-cells : Should be two. The first cell is the pin number and the - second cell is used to specify optional parameters: - - bit 0 specifies polarity (0 for normal, 1 for inverted) -- gpio-controller : Marks the device node as a GPIO controller. -- interrupts : Interrupt mapping for GPIO IRQ. -- gpio-ranges : Interaction with the PINCTRL subsystem. diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml new file mode 100644 index 000000000000..313b17229247 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/pl061-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ARM PL061 GPIO controller + +maintainers: + - Linus Walleij + - Rob Herring + +# We need a select here so we don't match all nodes with 'arm,primecell' +select: + properties: + compatible: + contains: + const: arm,pl061 + required: + - compatible + +properties: + $nodename: + pattern: "^gpio@[0-9a-f]+$" + + compatible: + items: + - const: arm,pl061 + - const: arm,primecell + + reg: + maxItems: 1 + + interrupts: + oneOf: + - maxItems: 1 + - maxItems: 8 + + interrupt-controller: true + + "#interrupt-cells": + const: 2 + + clocks: + maxItems: 1 + + clock-names: true + + "#gpio-cells": + const: 2 + + gpio-controller: true + + gpio-ranges: + maxItems: 8 + +required: + - compatible + - reg + - interrupts + - interrupt-controller + - "#interrupt-cells" + - clocks + - "#gpio-cells" + - gpio-controller + +additionalProperties: false + +... -- cgit From b2045303147254d01b1db90a83e5df3832c4264b Mon Sep 17 00:00:00 2001 From: Clément Péron Date: Mon, 27 May 2019 22:06:21 +0200 Subject: dt-bindings: sound: sun4i-spdif: Add Allwinner H6 compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allwinner H6 has a SPDIF controller with an increase of the fifo size and a sligher difference in memory mapping compare to H3/A64. This make it not compatible with the previous generation. Introduce a specific bindings for H6 SoC. Signed-off-by: Clément Péron Reviewed-by: Rob Herring Acked-by: Maxime Ripard Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml index a49ef2294a74..e0284d8c3b63 100644 --- a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml @@ -21,6 +21,7 @@ properties: - const: allwinner,sun4i-a10-spdif - const: allwinner,sun6i-a31-spdif - const: allwinner,sun8i-h3-spdif + - const: allwinner,sun50i-h6-spdif - items: - const: allwinner,sun8i-a83t-spdif - const: allwinner,sun8i-h3-spdif -- cgit From 0a0ca8e94ca36d2153c2fbea69a31f792bfc5831 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 24 May 2019 14:57:58 +0200 Subject: dt-bindings: sound: Convert Allwinner I2S binding to YAML The Allwinner SoCs feature an I2S controller across multiple SoC generations. However, earlier generations were a bit simpler than the subsequent ones, and for example would always have RX and TX capabilities, and no reset lines. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/sound/allwinner,sun4i-a10-i2s.yaml | 100 +++++++++++++++++++++ .../devicetree/bindings/sound/sun4i-i2s.txt | 45 ---------- 2 files changed, 100 insertions(+), 45 deletions(-) create mode 100644 Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml delete mode 100644 Documentation/devicetree/bindings/sound/sun4i-i2s.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml new file mode 100644 index 000000000000..85b2d6d84055 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: (GPL-2.0+ OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-i2s.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 I2S Controller Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#sound-dai-cells": + const: 0 + + compatible: + oneOf: + - const: allwinner,sun4i-a10-i2s + - const: allwinner,sun6i-a31-i2s + - const: allwinner,sun8i-a83t-i2s + - const: allwinner,sun8i-h3-i2s + - const: allwinner,sun50i-a64-codec-i2s + - items: + - const: allwinner,sun50i-a64-i2s + - const: allwinner,sun8i-h3-i2s + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: Bus Clock + - description: Module Clock + + clock-names: + items: + - const: apb + - const: mod + + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + + # Even though it only applies to subschemas under the conditionals, + # not listing them here will trigger a warning because of the + # additionalsProperties set to false. + resets: + maxItems: 1 + +allOf: + - if: + properties: + compatible: + contains: + enum: + - allwinner,sun6i-a31-i2s + - allwinner,sun8i-a83t-i2s + - allwinner,sun8i-h3-i2s + - allwinner,sun50i-a64-codec-i2s + + then: + required: + - resets + +required: + - "#sound-dai-cells" + - compatible + - reg + - interrupts + - clocks + - clock-names + - dmas + - dma-names + +additionalProperties: false + +examples: + - | + i2s0: i2s@1c22400 { + #sound-dai-cells = <0>; + compatible = "allwinner,sun4i-a10-i2s"; + reg = <0x01c22400 0x400>; + interrupts = <0 16 4>; + clocks = <&apb0_gates 3>, <&i2s0_clk>; + clock-names = "apb", "mod"; + dmas = <&dma 0 3>, <&dma 0 3>; + dma-names = "rx", "tx"; + }; + +... diff --git a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt deleted file mode 100644 index 61e71c1729e0..000000000000 --- a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt +++ /dev/null @@ -1,45 +0,0 @@ -* Allwinner A10 I2S controller - -The I2S bus (Inter-IC sound bus) is a serial link for digital -audio data transfer between devices in the system. - -Required properties: - -- compatible: should be one of the following: - - "allwinner,sun4i-a10-i2s" - - "allwinner,sun6i-a31-i2s" - - "allwinner,sun8i-a83t-i2s" - - "allwinner,sun8i-h3-i2s" - - "allwinner,sun50i-a64-codec-i2s" -- reg: physical base address of the controller and length of memory mapped - region. -- interrupts: should contain the I2S interrupt. -- dmas: DMA specifiers for tx and rx dma. See the DMA client binding, - Documentation/devicetree/bindings/dma/dma.txt -- dma-names: should include "tx" and "rx". -- clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names. -- clock-names: should contain the following: - - "apb" : clock for the I2S bus interface - - "mod" : module clock for the I2S controller -- #sound-dai-cells : Must be equal to 0 - -Required properties for the following compatibles: - - "allwinner,sun6i-a31-i2s" - - "allwinner,sun8i-a83t-i2s" - - "allwinner,sun8i-h3-i2s" - - "allwinner,sun50i-a64-codec-i2s" -- resets: phandle to the reset line for this codec - -Example: - -i2s0: i2s@1c22400 { - #sound-dai-cells = <0>; - compatible = "allwinner,sun4i-a10-i2s"; - reg = <0x01c22400 0x400>; - interrupts = ; - clocks = <&apb0_gates 3>, <&i2s0_clk>; - clock-names = "apb", "mod"; - dmas = <&dma SUN4I_DMA_NORMAL 3>, - <&dma SUN4I_DMA_NORMAL 3>; - dma-names = "rx", "tx"; -}; -- cgit From eb5b12843b067d685a8d7a191b928b07934b2d02 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 24 May 2019 14:57:59 +0200 Subject: dt-bindings: sound: sun4i-i2s: Document that the RX channel can be missing The A83t and compatibles controllers don't have any reception capabilities on some instances of the controllers, even though it was never documented as such in the binding before. Therefore, on those controllers, we don't have the option to set an RX DMA channel. This was already done in the DTSI, but the binding itself was never updated. Let's add a special case in the schemas. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../bindings/sound/allwinner,sun4i-a10-i2s.yaml | 52 +++++++++++++++++----- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml index 85b2d6d84055..eb3992138eec 100644 --- a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml @@ -41,19 +41,11 @@ properties: - const: apb - const: mod - dmas: - items: - - description: RX DMA Channel - - description: TX DMA Channel - - dma-names: - items: - - const: rx - - const: tx - # Even though it only applies to subschemas under the conditionals, # not listing them here will trigger a warning because of the # additionalsProperties set to false. + dmas: true + dma-names: true resets: maxItems: 1 @@ -72,6 +64,46 @@ allOf: required: - resets + - if: + properties: + compatible: + contains: + const: allwinner,sun8i-a83t-i2s + + then: + properties: + dmas: + minItems: 1 + maxItems: 2 + items: + - description: RX DMA Channel + - description: TX DMA Channel + description: + Some controllers cannot receive but can only transmit + data. In such a case, the RX DMA channel is to be omitted. + + dma-names: + oneOf: + - items: + - const: rx + - const: tx + - const: tx + description: + Some controllers cannot receive but can only transmit + data. In such a case, the RX name is to be omitted. + + else: + properties: + dmas: + items: + - description: RX DMA Channel + - description: TX DMA Channel + + dma-names: + items: + - const: rx + - const: tx + required: - "#sound-dai-cells" - compatible -- cgit From de1dbcee433ccff3e0a37698c65b40542c9d4cf1 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 29 Mar 2019 10:05:55 -0400 Subject: doc/rcuref: Document real world examples in kernel Document similar real world examples in the kernel corresponding to the second and third code snippets. Also correct an issue in release_referenced() in the code snippet example. Cc: oleg@redhat.com Cc: jannh@google.com Signed-off-by: Joel Fernandes (Google) [ paulmck: Do a bit of wordsmithing. ] Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcuref.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index 613033ff2b9b..5e6429d66c24 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -12,6 +12,7 @@ please read on. Reference counting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straightforward: +CODE LISTING A: 1. 2. add() search_and_reference() { { @@ -28,7 +29,8 @@ add() search_and_reference() release_referenced() delete() { { ... write_lock(&list_lock); - atomic_dec(&el->rc, relfunc) ... + if(atomic_dec_and_test(&el->rc)) ... + kfree(el); ... remove_element } write_unlock(&list_lock); ... @@ -44,6 +46,7 @@ search_and_reference() could potentially hold reference to an element which has already been deleted from the list/array. Use atomic_inc_not_zero() in this scenario as follows: +CODE LISTING B: 1. 2. add() search_and_reference() { { @@ -79,6 +82,7 @@ search_and_reference() code path. In such cases, the atomic_dec_and_test() may be moved from delete() to el_free() as follows: +CODE LISTING C: 1. 2. add() search_and_reference() { { @@ -114,6 +118,17 @@ element can therefore safely be freed. This in turn guarantees that if any reader finds the element, that reader may safely acquire a reference without checking the value of the reference counter. +A clear advantage of the RCU-based pattern in listing C over the one +in listing B is that any call to search_and_reference() that locates +a given object will succeed in obtaining a reference to that object, +even given a concurrent invocation of delete() for that same object. +Similarly, a clear advantage of both listings B and C over listing A is +that a call to delete() is not delayed even if there are an arbitrarily +large number of calls to search_and_reference() searching for the same +object that delete() was invoked on. Instead, all that is delayed is +the eventual invocation of kfree(), which is usually not a problem on +modern computer systems, even the small ones. + In cases where delete() can sleep, synchronize_rcu() can be called from delete(), so that el_free() can be subsumed into delete as follows: @@ -130,3 +145,7 @@ delete() kfree(el); ... } + +As additional examples in the kernel, the pattern in listing C is used by +reference counting of struct pid, while the pattern in listing B is used by +struct posix_acl. -- cgit From 588759a39145e18dd808341861d45d44383778b5 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 14 Apr 2019 11:11:03 +0800 Subject: doc: Fixup definition of rcupdate.rcu_task_stall_timeout A positive value of rcupdate.rcu_task_stall_timeout is an interval in seconds rather than jiffies. Signed-off-by: Zhenzhong Duan Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1ab70c37921f..13e88fc00f01 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -153,7 +153,7 @@ rcupdate.rcu_task_stall_timeout This boot/sysfs parameter controls the RCU-tasks stall warning interval. A value of zero or less suppresses RCU-tasks stall warnings. A positive value sets the stall-warning interval - in jiffies. An RCU-tasks stall warning starts with the line: + in seconds. An RCU-tasks stall warning starts with the line: INFO: rcu_tasks detected stalls on tasks: -- cgit From 714b6904e23e1c37f262a4cd02b34d0f1863e227 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 07:12:18 -0700 Subject: doc: Remove ".vnet" from paulmck email addresses Signed-off-by: Paul E. McKenney --- Documentation/core-api/circular-buffers.rst | 2 +- Documentation/memory-barriers.txt | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/core-api/circular-buffers.rst b/Documentation/core-api/circular-buffers.rst index 53e51caa3347..50966f66e398 100644 --- a/Documentation/core-api/circular-buffers.rst +++ b/Documentation/core-api/circular-buffers.rst @@ -3,7 +3,7 @@ Circular Buffers ================ :Author: David Howells -:Author: Paul E. McKenney +:Author: Paul E. McKenney Linux provides a number of features that can be used to implement circular diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f70ebcdfe592..e4e07c8ab89e 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -3,7 +3,7 @@ ============================ By: David Howells - Paul E. McKenney + Paul E. McKenney Will Deacon Peter Zijlstra diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index db0b9d8619f1..5f3c74dcad43 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -24,7 +24,7 @@ Documentation/memory-barriers.txt ========================= 저자: David Howells - Paul E. McKenney + Paul E. McKenney Will Deacon Peter Zijlstra -- cgit From a82c3149ad8b4a84f9737a633250815d5cf0cc5e Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Fri, 3 May 2019 19:45:07 -0400 Subject: media: docs-rst: Clarify older field vs. first transmitted field Add a paragraph to make it more clear that video equipment will transmit fields in the same order the fields were captured, and replace some of the "is transmitted first" language with "is the older field", since the latter is the important info for motion compensation applications. Signed-off-by: Steve Longerbeam Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/field-order.rst | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/media/uapi/v4l/field-order.rst b/Documentation/media/uapi/v4l/field-order.rst index d640e922a974..c422bebe4314 100644 --- a/Documentation/media/uapi/v4l/field-order.rst +++ b/Documentation/media/uapi/v4l/field-order.rst @@ -51,6 +51,11 @@ determined by the video standard. Hence the distinction between temporal and spatial order of fields. The diagrams below should make this clearer. +In V4L it is assumed that all video cameras transmit fields on the media +bus in the same order they were captured, so if the top field was +captured first (is the older field), the top field is also transmitted +first on the bus. + All video capture and output devices must report the current field order. Some drivers may permit the selection of a different order, to this end applications initialize the ``field`` field of struct @@ -101,10 +106,10 @@ enum v4l2_field * - ``V4L2_FIELD_INTERLACED`` - 4 - Images contain both fields, interleaved line by line. The temporal - order of the fields (whether the top or bottom field is first - transmitted) depends on the current video standard. M/NTSC - transmits the bottom field first, all other standards the top - field first. + order of the fields (whether the top or bottom field is older) + depends on the current video standard. In M/NTSC the bottom + field is the older field. In all other standards the top field + is the older field. * - ``V4L2_FIELD_SEQ_TB`` - 5 - Images contain both fields, the top field lines are stored first @@ -135,11 +140,11 @@ enum v4l2_field * - ``V4L2_FIELD_INTERLACED_TB`` - 8 - Images contain both fields, interleaved line by line, top field - first. The top field is transmitted first. + first. The top field is the older field. * - ``V4L2_FIELD_INTERLACED_BT`` - 9 - Images contain both fields, interleaved line by line, top field - first. The bottom field is transmitted first. + first. The bottom field is the older field. -- cgit From 5e27a314a11f7fa53795282eea59a024fd3020ba Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 27 May 2019 14:17:10 +0200 Subject: dt-bindings: interrupt-controller: Add Renesas RZ/A1 Interrupt Controller Add DT bindings for the Renesas RZ/A1 Interrupt Controller. Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman Reviewed-by: Rob Herring Signed-off-by: Marc Zyngier --- .../interrupt-controller/renesas,rza1-irqc.txt | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt new file mode 100644 index 000000000000..727b7e4cd6e0 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt @@ -0,0 +1,43 @@ +DT bindings for the Renesas RZ/A1 Interrupt Controller + +The RZ/A1 Interrupt Controller is a front-end for the GIC found on Renesas +RZ/A1 and RZ/A2 SoCs: + - IRQ sense select for 8 external interrupts, 1:1-mapped to 8 GIC SPI + interrupts, + - NMI edge select. + +Required properties: + - compatible: Must be "renesas,-irqc", and "renesas,rza1-irqc" as + fallback. + Examples with soctypes are: + - "renesas,r7s72100-irqc" (RZ/A1H) + - "renesas,r7s9210-irqc" (RZ/A2M) + - #interrupt-cells: Must be 2 (an interrupt index and flags, as defined + in interrupts.txt in this directory) + - #address-cells: Must be zero + - interrupt-controller: Marks the device as an interrupt controller + - reg: Base address and length of the memory resource used by the interrupt + controller + - interrupt-map: Specifies the mapping from external interrupts to GIC + interrupts + - interrupt-map-mask: Must be <7 0> + +Example: + + irqc: interrupt-controller@fcfef800 { + compatible = "renesas,r7s72100-irqc", "renesas,rza1-irqc"; + #interrupt-cells = <2>; + #address-cells = <0>; + interrupt-controller; + reg = <0xfcfef800 0x6>; + interrupt-map = + <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <1 0 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <2 0 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>, + <3 0 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>, + <4 0 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>, + <5 0 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>, + <6 0 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>, + <7 0 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>; + interrupt-map-mask = <7 0>; + }; -- cgit From 5902bca94ae05316ec7feab9b84cb07ffa5c1175 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 24 Apr 2019 06:43:47 -0400 Subject: media: v4l2-ctrl: add MPEG-2 profile and level controls Add MPEG-2 CID definitions for profiles and levels defined in ITU-T Rec. H.262. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/ext-ctrls-codec.rst | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'Documentation') diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst index 4a8446203085..843c93e8e7bc 100644 --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst @@ -759,6 +759,32 @@ enum v4l2_mpeg_video_h264_level - +.. _v4l2-mpeg-video-mpeg2-level: + +``V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL`` + (enum) + +enum v4l2_mpeg_video_mpeg2_level - + The level information for the MPEG2 elementary stream. Applicable to + MPEG2 codecs. Possible values are: + + + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_LOW`` + - Low Level (LL) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_MAIN`` + - Main Level (ML) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH_1440`` + - High-1440 Level (H-14) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH`` + - High Level (HL) + + + .. _v4l2-mpeg-video-mpeg4-level: ``V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL`` @@ -845,6 +871,36 @@ enum v4l2_mpeg_video_h264_profile - +.. _v4l2-mpeg-video-mpeg2-profile: + +``V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE`` + (enum) + +enum v4l2_mpeg_video_mpeg2_profile - + The profile information for MPEG2. Applicable to MPEG2 codecs. + Possible values are: + + + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SIMPLE`` + - Simple profile (SP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MAIN`` + - Main profile (MP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SNR_SCALABLE`` + - SNR Scalable profile (SNR) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SPATIALLY_SCALABLE`` + - Spatially Scalable profile (Spt) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_HIGH`` + - High profile (HP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MULTIVIEW`` + - Multi-view profile (MVP) + + + .. _v4l2-mpeg-video-mpeg4-profile: ``V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE`` -- cgit From 1f0545d3ed1df3a915546cee60b56f855962ed69 Mon Sep 17 00:00:00 2001 From: Pawel Osciak Date: Fri, 24 May 2019 05:20:28 -0400 Subject: media: uapi: Add H264 low-level decoder API compound controls. Stateless video codecs will require both the H264 metadata and slices in order to be able to decode frames. This introduces the definitions for the structures used to pass the metadata from the userspace to the kernel. [hverkuil-cisco@xs4all.nl: add space after . in ".For"] [hverkuil-cisco@xs4all.nl: sync v4l2_ctrl_h264_decode_params struct layout with header] Co-developed-by: Maxime Ripard Reviewed-by: Paul Kocialkowski Reviewed-by: Tomasz Figa Signed-off-by: Pawel Osciak Signed-off-by: Guenter Roeck Signed-off-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/biblio.rst | 9 + Documentation/media/uapi/v4l/ext-ctrls-codec.rst | 569 ++++++++++++++++++++++ Documentation/media/uapi/v4l/vidioc-queryctrl.rst | 30 ++ Documentation/media/videodev2.h.rst.exceptions | 5 + 4 files changed, 613 insertions(+) (limited to 'Documentation') diff --git a/Documentation/media/uapi/v4l/biblio.rst b/Documentation/media/uapi/v4l/biblio.rst index ec33768c055e..8f4eb8823d82 100644 --- a/Documentation/media/uapi/v4l/biblio.rst +++ b/Documentation/media/uapi/v4l/biblio.rst @@ -122,6 +122,15 @@ ITU BT.1119 :author: International Telecommunication Union (http://www.itu.ch) +.. _h264: + +ITU-T Rec. H.264 Specification (04/2017 Edition) +================================================ + +:title: ITU-T Recommendation H.264 "Advanced Video Coding for Generic Audiovisual Services" + +:author: International Telecommunication Union (http://www.itu.ch) + .. _jfif: JFIF diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst index 843c93e8e7bc..d6ea2ffd65c5 100644 --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst @@ -1451,6 +1451,575 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type - - Layer number +.. _v4l2-mpeg-h264: + +``V4L2_CID_MPEG_VIDEO_H264_SPS (struct)`` + Specifies the sequence parameter set (as extracted from the + bitstream) for the associated H264 slice data. This includes the + necessary parameters for configuring a stateless hardware decoding + pipeline for H264. The bitstream parameters are defined according + to :ref:`h264`, section 7.4.2.1.1 "Sequence Parameter Set Data + Semantics". For further documentation, refer to the above + specification, unless there is an explicit comment stating + otherwise. + + .. note:: + + This compound control is not yet part of the public kernel API and + it is expected to change. + +.. c:type:: v4l2_ctrl_h264_sps + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_h264_sps + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``profile_idc`` + - + * - __u8 + - ``constraint_set_flags`` + - See :ref:`Sequence Parameter Set Constraints Set Flags ` + * - __u8 + - ``level_idc`` + - + * - __u8 + - ``seq_parameter_set_id`` + - + * - __u8 + - ``chroma_format_idc`` + - + * - __u8 + - ``bit_depth_luma_minus8`` + - + * - __u8 + - ``bit_depth_chroma_minus8`` + - + * - __u8 + - ``log2_max_frame_num_minus4`` + - + * - __u8 + - ``pic_order_cnt_type`` + - + * - __u8 + - ``log2_max_pic_order_cnt_lsb_minus4`` + - + * - __u8 + - ``max_num_ref_frames`` + - + * - __u8 + - ``num_ref_frames_in_pic_order_cnt_cycle`` + - + * - __s32 + - ``offset_for_ref_frame[255]`` + - + * - __s32 + - ``offset_for_non_ref_pic`` + - + * - __s32 + - ``offset_for_top_to_bottom_field`` + - + * - __u16 + - ``pic_width_in_mbs_minus1`` + - + * - __u16 + - ``pic_height_in_map_units_minus1`` + - + * - __u32 + - ``flags`` + - See :ref:`Sequence Parameter Set Flags ` + +.. _h264_sps_constraints_set_flags: + +``Sequence Parameter Set Constraints Set Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_SPS_CONSTRAINT_SET0_FLAG`` + - 0x00000001 + - + * - ``V4L2_H264_SPS_CONSTRAINT_SET1_FLAG`` + - 0x00000002 + - + * - ``V4L2_H264_SPS_CONSTRAINT_SET2_FLAG`` + - 0x00000004 + - + * - ``V4L2_H264_SPS_CONSTRAINT_SET3_FLAG`` + - 0x00000008 + - + * - ``V4L2_H264_SPS_CONSTRAINT_SET4_FLAG`` + - 0x00000010 + - + * - ``V4L2_H264_SPS_CONSTRAINT_SET5_FLAG`` + - 0x00000020 + - + +.. _h264_sps_flags: + +``Sequence Parameter Set Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE`` + - 0x00000001 + - + * - ``V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS`` + - 0x00000002 + - + * - ``V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO`` + - 0x00000004 + - + * - ``V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED`` + - 0x00000008 + - + * - ``V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY`` + - 0x00000010 + - + * - ``V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD`` + - 0x00000020 + - + * - ``V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE`` + - 0x00000040 + - + +``V4L2_CID_MPEG_VIDEO_H264_PPS (struct)`` + Specifies the picture parameter set (as extracted from the + bitstream) for the associated H264 slice data. This includes the + necessary parameters for configuring a stateless hardware decoding + pipeline for H264. The bitstream parameters are defined according + to :ref:`h264`, section 7.4.2.2 "Picture Parameter Set RBSP + Semantics". For further documentation, refer to the above + specification, unless there is an explicit comment stating + otherwise. + + .. note:: + + This compound control is not yet part of the public kernel API and + it is expected to change. + +.. c:type:: v4l2_ctrl_h264_pps + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_h264_pps + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``pic_parameter_set_id`` + - + * - __u8 + - ``seq_parameter_set_id`` + - + * - __u8 + - ``num_slice_groups_minus1`` + - + * - __u8 + - ``num_ref_idx_l0_default_active_minus1`` + - + * - __u8 + - ``num_ref_idx_l1_default_active_minus1`` + - + * - __u8 + - ``weighted_bipred_idc`` + - + * - __s8 + - ``pic_init_qp_minus26`` + - + * - __s8 + - ``pic_init_qs_minus26`` + - + * - __s8 + - ``chroma_qp_index_offset`` + - + * - __s8 + - ``second_chroma_qp_index_offset`` + - + * - __u16 + - ``flags`` + - See :ref:`Picture Parameter Set Flags ` + +.. _h264_pps_flags: + +``Picture Parameter Set Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE`` + - 0x00000001 + - + * - ``V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT`` + - 0x00000002 + - + * - ``V4L2_H264_PPS_FLAG_WEIGHTED_PRED`` + - 0x00000004 + - + * - ``V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT`` + - 0x00000008 + - + * - ``V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED`` + - 0x00000010 + - + * - ``V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT`` + - 0x00000020 + - + * - ``V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE`` + - 0x00000040 + - + * - ``V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT`` + - 0x00000080 + - + +``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (struct)`` + Specifies the scaling matrix (as extracted from the bitstream) for + the associated H264 slice data. The bitstream parameters are + defined according to :ref:`h264`, section 7.4.2.1.1.1 "Scaling + List Semantics". For further documentation, refer to the above + specification, unless there is an explicit comment stating + otherwise. + + .. note:: + + This compound control is not yet part of the public kernel API and + it is expected to change. + +.. c:type:: v4l2_ctrl_h264_scaling_matrix + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_h264_scaling_matrix + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``scaling_list_4x4[6][16]`` + - + * - __u8 + - ``scaling_list_8x8[6][64]`` + - + +``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (struct)`` + Specifies the slice parameters (as extracted from the bitstream) + for the associated H264 slice data. This includes the necessary + parameters for configuring a stateless hardware decoding pipeline + for H264. The bitstream parameters are defined according to + :ref:`h264`, section 7.4.3 "Slice Header Semantics". For further + documentation, refer to the above specification, unless there is + an explicit comment stating otherwise. + + .. note:: + + This compound control is not yet part of the public kernel API + and it is expected to change. + + This structure is expected to be passed as an array, with one + entry for each slice included in the bitstream buffer. + +.. c:type:: v4l2_ctrl_h264_slice_params + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_h264_slice_params + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u32 + - ``size`` + - + * - __u32 + - ``header_bit_size`` + - + * - __u16 + - ``first_mb_in_slice`` + - + * - __u8 + - ``slice_type`` + - + * - __u8 + - ``pic_parameter_set_id`` + - + * - __u8 + - ``colour_plane_id`` + - + * - __u8 + - ``redundant_pic_cnt`` + - + * - __u16 + - ``frame_num`` + - + * - __u16 + - ``idr_pic_id`` + - + * - __u16 + - ``pic_order_cnt_lsb`` + - + * - __s32 + - ``delta_pic_order_cnt_bottom`` + - + * - __s32 + - ``delta_pic_order_cnt0`` + - + * - __s32 + - ``delta_pic_order_cnt1`` + - + * - struct :c:type:`v4l2_h264_pred_weight_table` + - ``pred_weight_table`` + - + * - __u32 + - ``dec_ref_pic_marking_bit_size`` + - + * - __u32 + - ``pic_order_cnt_bit_size`` + - + * - __u8 + - ``cabac_init_idc`` + - + * - __s8 + - ``slice_qp_delta`` + - + * - __s8 + - ``slice_qs_delta`` + - + * - __u8 + - ``disable_deblocking_filter_idc`` + - + * - __s8 + - ``slice_alpha_c0_offset_div2`` + - + * - __s8 + - ``slice_beta_offset_div2`` + - + * - __u8 + - ``num_ref_idx_l0_active_minus1`` + - + * - __u8 + - ``num_ref_idx_l1_active_minus1`` + - + * - __u32 + - ``slice_group_change_cycle`` + - + * - __u8 + - ``ref_pic_list0[32]`` + - Reference picture list after applying the per-slice modifications + * - __u8 + - ``ref_pic_list1[32]`` + - Reference picture list after applying the per-slice modifications + * - __u32 + - ``flags`` + - See :ref:`Slice Parameter Flags ` + +.. _h264_slice_flags: + +``Slice Parameter Set Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_SLICE_FLAG_FIELD_PIC`` + - 0x00000001 + - + * - ``V4L2_H264_SLICE_FLAG_BOTTOM_FIELD`` + - 0x00000002 + - + * - ``V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED`` + - 0x00000004 + - + * - ``V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH`` + - 0x00000008 + - + +``Prediction Weight Table`` + + The bitstream parameters are defined according to :ref:`h264`, + section 7.4.3.2 "Prediction Weight Table Semantics". For further + documentation, refer to the above specification, unless there is + an explicit comment stating otherwise. + +.. c:type:: v4l2_h264_pred_weight_table + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_h264_pred_weight_table + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u16 + - ``luma_log2_weight_denom`` + - + * - __u16 + - ``chroma_log2_weight_denom`` + - + * - struct :c:type:`v4l2_h264_weight_factors` + - ``weight_factors[2]`` + - The weight factors at index 0 are the weight factors for the reference + list 0, the one at index 1 for the reference list 1. + +.. c:type:: v4l2_h264_weight_factors + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_h264_weight_factors + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __s16 + - ``luma_weight[32]`` + - + * - __s16 + - ``luma_offset[32]`` + - + * - __s16 + - ``chroma_weight[32][2]`` + - + * - __s16 + - ``chroma_offset[32][2]`` + - + +``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (struct)`` + Specifies the decode parameters (as extracted from the bitstream) + for the associated H264 slice data. This includes the necessary + parameters for configuring a stateless hardware decoding pipeline + for H264. The bitstream parameters are defined according to + :ref:`h264`. For further documentation, refer to the above + specification, unless there is an explicit comment stating + otherwise. + + .. note:: + + This compound control is not yet part of the public kernel API and + it is expected to change. + +.. c:type:: v4l2_ctrl_h264_decode_params + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_h264_decode_params + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - struct :c:type:`v4l2_h264_dpb_entry` + - ``dpb[16]`` + - + * - __u16 + - ``num_slices`` + - Number of slices needed to decode the current frame + * - __u16 + - ``nal_ref_idc`` + - NAL reference ID value coming from the NAL Unit header + * - __u8 + - ``ref_pic_list_p0[32]`` + - Backward reference list used by P-frames in the original bitstream order + * - __u8 + - ``ref_pic_list_b0[32]`` + - Backward reference list used by B-frames in the original bitstream order + * - __u8 + - ``ref_pic_list_b1[32]`` + - Forward reference list used by B-frames in the original bitstream order + * - __s32 + - ``top_field_order_cnt`` + - Picture Order Count for the coded top field + * - __s32 + - ``bottom_field_order_cnt`` + - Picture Order Count for the coded bottom field + * - __u32 + - ``flags`` + - See :ref:`Decode Parameters Flags ` + +.. _h264_decode_params_flags: + +``Decode Parameters Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC`` + - 0x00000001 + - That picture is an IDR picture + +.. c:type:: v4l2_h264_dpb_entry + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_h264_dpb_entry + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u64 + - ``reference_ts`` + - Timestamp of the V4L2 capture buffer to use as reference, used + with B-coded and P-coded frames. The timestamp refers to the + ``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the + :c:func:`v4l2_timeval_to_ns()` function to convert the struct + :c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64. + * - __u16 + - ``frame_num`` + - + * - __u16 + - ``pic_num`` + - + * - __s32 + - ``top_field_order_cnt`` + - + * - __s32 + - ``bottom_field_order_cnt`` + - + * - __u32 + - ``flags`` + - See :ref:`DPB Entry Flags ` + +.. _h264_dpb_flags: + +``DPB Entries Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_H264_DPB_ENTRY_FLAG_VALID`` + - 0x00000001 + - The DPB entry is valid and should be considered + * - ``V4L2_H264_DPB_ENTRY_FLAG_ACTIVE`` + - 0x00000002 + - The DPB entry is currently being used as a reference frame + * - ``V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM`` + - 0x00000004 + - The DPB entry is a long term reference frame .. _v4l2-mpeg-mpeg2: diff --git a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst index f824162d0ea9..dc500632095d 100644 --- a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst +++ b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst @@ -443,6 +443,36 @@ See also the examples in :ref:`control`. - n/a - A struct :c:type:`v4l2_ctrl_mpeg2_quantization`, containing MPEG-2 quantization matrices for stateless video decoders. + * - ``V4L2_CTRL_TYPE_H264_SPS`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_h264_sps`, containing H264 + sequence parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_H264_PPS`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_h264_pps`, containing H264 + picture parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_H264_SCALING_MATRIX`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_h264_scaling_matrix`, containing H264 + scaling matrices for stateless video decoders. + * - ``V4L2_CTRL_TYPE_H264_SLICE_PARAMS`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_h264_slice_params`, containing H264 + slice parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_H264_DECODE_PARAMS`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_h264_decode_params`, containing H264 + decode parameters for stateless video decoders. .. tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}| diff --git a/Documentation/media/videodev2.h.rst.exceptions b/Documentation/media/videodev2.h.rst.exceptions index 64d348e67df9..55cbe324b9fc 100644 --- a/Documentation/media/videodev2.h.rst.exceptions +++ b/Documentation/media/videodev2.h.rst.exceptions @@ -136,6 +136,11 @@ replace symbol V4L2_CTRL_TYPE_U32 :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_U8 :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_MPEG2_QUANTIZATION :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_H264_SPS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_H264_PPS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_H264_SCALING_MATRIX :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_H264_SLICE_PARAMS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_H264_DECODE_PARAMS :c:type:`v4l2_ctrl_type` # V4L2 capability defines replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities -- cgit From 67e84a98af6505d78be9e1a383fadd8de2f9fed0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 24 May 2019 05:20:30 -0400 Subject: media: pixfmt: Add H264_SLICE_RAW format documentation The H264_SLICE_RAW format introduced before is meant for stateless decoders that will need the H264 parsed slice data without the start code. Let's document it. Signed-off-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-compressed.rst | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'Documentation') diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst index 6c961cfb74da..4b701fc7653e 100644 --- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst +++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst @@ -52,6 +52,31 @@ Compressed Formats - ``V4L2_PIX_FMT_H264_MVC`` - 'M264' - H264 MVC video elementary stream. + * .. _V4L2-PIX-FMT-H264-SLICE-RAW: + + - ``V4L2_PIX_FMT_H264_SLICE_RAW`` + - 'S264' + - H264 parsed slice data, without the start code and as + extracted from the H264 bitstream. This format is adapted for + stateless video decoders that implement an H264 pipeline + (using the :ref:`mem2mem` and :ref:`media-request-api`). + Metadata associated with the frame to decode are required to + be passed through the ``V4L2_CID_MPEG_VIDEO_H264_SPS``, + ``V4L2_CID_MPEG_VIDEO_H264_PPS``, + ``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX``, + ``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS`` and + ``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS`` controls. See the + :ref:`associated Codec Control IDs `. Exactly + one output and one capture buffer must be provided for use + with this pixel format. The output buffer must contain the + appropriate number of macroblocks to decode a full + corresponding frame to the matching capture buffer. + + .. note:: + + This format is not yet part of the public kernel API and it + is expected to change. + * .. _V4L2-PIX-FMT-H263: - ``V4L2_PIX_FMT_H263`` -- cgit From 156fa8845a5715199d1bb56f8bde4b403de33946 Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Tue, 28 May 2019 13:11:17 -0400 Subject: media: dt-bindings: media: document allegro-dvt bindings Add device-tree bindings for the Allegro DVT video IP core found on the Xilinx ZynqMP EV family. Signed-off-by: Michael Tretter Reviewed-by: Rob Herring Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../devicetree/bindings/media/allegro.txt | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Documentation/devicetree/bindings/media/allegro.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/media/allegro.txt b/Documentation/devicetree/bindings/media/allegro.txt new file mode 100644 index 000000000000..a92e2fbf26c9 --- /dev/null +++ b/Documentation/devicetree/bindings/media/allegro.txt @@ -0,0 +1,43 @@ +Device-tree bindings for the Allegro DVT video IP codecs present in the Xilinx +ZynqMP SoC. The IP core may either be a H.264/H.265 encoder or H.264/H.265 +decoder ip core. + +Each actual codec engines is controlled by a microcontroller (MCU). Host +software uses a provided mailbox interface to communicate with the MCU. The +MCU share an interrupt. + +Required properties: + - compatible: value should be one of the following + "allegro,al5e-1.1", "allegro,al5e": encoder IP core + "allegro,al5d-1.1", "allegro,al5d": decoder IP core + - reg: base and length of the memory mapped register region and base and + length of the memory mapped sram + - reg-names: must include "regs" and "sram" + - interrupts: shared interrupt from the MCUs to the processing system + - clocks: must contain an entry for each entry in clock-names + - clock-names: must include "core_clk", "mcu_clk", "m_axi_core_aclk", + "m_axi_mcu_aclk", "s_axi_lite_aclk" + +Example: + al5e: video-codec@a0009000 { + compatible = "allegro,al5e-1.1", "allegro,al5e"; + reg = <0 0xa0009000 0 0x1000>, + <0 0xa0000000 0 0x8000>; + reg-names = "regs", "sram"; + interrupts = <0 96 4>; + clocks = <&xlnx_vcu 0>, <&xlnx_vcu 1>, + <&clkc 71>, <&clkc 71>, <&clkc 71>; + clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk", + "m_axi_mcu_aclk", "s_axi_lite_aclk" + }; + al5d: video-codec@a0029000 { + compatible = "allegro,al5d-1.1", "allegro,al5d"; + reg = <0 0xa0029000 0 0x1000>, + <0 0xa0020000 0 0x8000>; + reg-names = "regs", "sram"; + interrupts = <0 96 4>; + clocks = <&xlnx_vcu 2>, <&xlnx_vcu 3>, + <&clkc 71>, <&clkc 71>, <&clkc 71>; + clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk", + "m_axi_mcu_aclk", "s_axi_lite_aclk" + }; -- cgit From 8df39e16877ffa7a055a2c35c86df69c37ad73a9 Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Tue, 28 May 2019 13:11:18 -0400 Subject: media: dt-bindings: media: Add vendor prefix for allegro Add vendor prefix for Allegro DVT, a provider of H.264/AVC, H.265/HEVC, AVS2, VP9 and AV1 compliance test suites and H.264/AVC, H.265/HEVC, and VP9 encoder, codec and decoder hardware (RTL) IPs. Signed-off-by: Michael Tretter Reviewed-by: Rob Herring Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 33a65a45e319..7da11464991a 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -49,6 +49,8 @@ patternProperties: description: Aeroflex Gaisler AB "^al,.*": description: Annapurna Labs + "^allegro,.*" + description: Allegro DVT "^allo,.*": description: Allo.com "^allwinner,.*": -- cgit From 97a1aa00c178589a62b973848cfb40132793a1ec Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 6 Jun 2019 15:13:10 +0300 Subject: docs/habanalabs: update text for some entries in sysfs This patch updates the description of some entries in sysfs for the habanalabs driver. Signed-off-by: Oded Gabbay --- Documentation/ABI/testing/sysfs-driver-habanalabs | 42 +++++++++++++---------- 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs index 78b2bcf316a3..f433fc6db3c6 100644 --- a/Documentation/ABI/testing/sysfs-driver-habanalabs +++ b/Documentation/ABI/testing/sysfs-driver-habanalabs @@ -62,18 +62,20 @@ What: /sys/class/habanalabs/hl/ic_clk Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Allows the user to set the maximum clock frequency of the - Interconnect fabric. Writes to this parameter affect the device - only when the power management profile is set to "manual" mode. - The device IC clock might be set to lower value then the +Description: Allows the user to set the maximum clock frequency, in Hz, of + the Interconnect fabric. Writes to this parameter affect the + device only when the power management profile is set to "manual" + mode. The device IC clock might be set to lower value than the maximum. The user should read the ic_clk_curr to see the actual - frequency value of the IC + frequency value of the IC. This property is valid only for the + Goya ASIC family What: /sys/class/habanalabs/hl/ic_clk_curr Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Displays the current clock frequency of the Interconnect fabric +Description: Displays the current clock frequency, in Hz, of the Interconnect + fabric. This property is valid only for the Goya ASIC family What: /sys/class/habanalabs/hl/infineon_ver Date: Jan 2019 @@ -92,18 +94,20 @@ What: /sys/class/habanalabs/hl/mme_clk Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Allows the user to set the maximum clock frequency of the - MME compute engine. Writes to this parameter affect the device - only when the power management profile is set to "manual" mode. - The device MME clock might be set to lower value then the +Description: Allows the user to set the maximum clock frequency, in Hz, of + the MME compute engine. Writes to this parameter affect the + device only when the power management profile is set to "manual" + mode. The device MME clock might be set to lower value than the maximum. The user should read the mme_clk_curr to see the actual - frequency value of the MME + frequency value of the MME. This property is valid only for the + Goya ASIC family What: /sys/class/habanalabs/hl/mme_clk_curr Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Displays the current clock frequency of the MME compute engine +Description: Displays the current clock frequency, in Hz, of the MME compute + engine. This property is valid only for the Goya ASIC family What: /sys/class/habanalabs/hl/pci_addr Date: Jan 2019 @@ -163,18 +167,20 @@ What: /sys/class/habanalabs/hl/tpc_clk Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Allows the user to set the maximum clock frequency of the - TPC compute engines. Writes to this parameter affect the device - only when the power management profile is set to "manual" mode. - The device TPC clock might be set to lower value then the +Description: Allows the user to set the maximum clock frequency, in Hz, of + the TPC compute engines. Writes to this parameter affect the + device only when the power management profile is set to "manual" + mode. The device TPC clock might be set to lower value than the maximum. The user should read the tpc_clk_curr to see the actual - frequency value of the TPC + frequency value of the TPC. This property is valid only for + Goya ASIC family What: /sys/class/habanalabs/hl/tpc_clk_curr Date: Jan 2019 KernelVersion: 5.1 Contact: oded.gabbay@gmail.com -Description: Displays the current clock frequency of the TPC compute engines +Description: Displays the current clock frequency, in Hz, of the TPC compute + engines. This property is valid only for the Goya ASIC family What: /sys/class/habanalabs/hl/uboot_ver Date: Jan 2019 -- cgit From 3e6a515ff4d40b690511a250ceb11a6e1eba4081 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 27 May 2019 05:11:32 -0400 Subject: media: media-ioc-enum-links.rst: fix incorrect reserved field documentation The reserved field array for struct media_link_desc has length 2, not 4. And the reserved field array of struct media_links_enum was never documented at all. Signed-off-by: Hans Verkuil Acked-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/mediactl/media-ioc-enum-links.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst index a982f16e55a4..b827ebc398f8 100644 --- a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst +++ b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst @@ -84,6 +84,11 @@ returned during the enumeration process. - Pointer to a links array allocated by the application. Ignored if NULL. + * - __u32 + - ``reserved[4]`` + - Reserved for future extensions. Drivers and applications must set + the array to zero. + .. c:type:: media_pad_desc @@ -135,7 +140,7 @@ returned during the enumeration process. - Link flags, see :ref:`media-link-flag` for more details. * - __u32 - - ``reserved[4]`` + - ``reserved[2]`` - Reserved for future extensions. Drivers and applications must set the array to zero. -- cgit From 0a0c2a9262a145570751111603d1f0385ce1a443 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 20 May 2019 11:06:35 -0400 Subject: media: dt-bindings: media: sun6i-csi: Add compatible string for A83T variant The A83T SoC has a camera sensor interface (known as CSI in Allwinner lingo), which is similar to the one found on the A64 and H3. The only difference seems to be that support of MIPI CSI through a connected MIPI CSI-2 bridge. Add a compatible string for this variant. Signed-off-by: Chen-Yu Tsai Acked-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- Documentation/devicetree/bindings/media/sun6i-csi.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/media/sun6i-csi.txt b/Documentation/devicetree/bindings/media/sun6i-csi.txt index 0dd540bb03db..a2e3e56f0257 100644 --- a/Documentation/devicetree/bindings/media/sun6i-csi.txt +++ b/Documentation/devicetree/bindings/media/sun6i-csi.txt @@ -6,6 +6,7 @@ Allwinner V3s SoC features a CSI module(CSI1) with parallel interface. Required properties: - compatible: value must be one of: * "allwinner,sun6i-a31-csi" + * "allwinner,sun8i-a83t-csi" * "allwinner,sun8i-h3-csi" * "allwinner,sun8i-v3s-csi" * "allwinner,sun50i-a64-csi" -- cgit From 29a50257a9d6d16320bc5e2fc1b15e0f2b2eb7cf Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 28 May 2019 17:57:09 -0700 Subject: dt-bindings: PCI: qcom: Add QCS404 to the binding The Qualcomm QCS404 platform contains a PCIe controller, add this to the Qualcomm PCI binding document. The controller is the same version as the one used in IPQ4019, but the PHY part is described separately, hence the difference in clocks and resets. Signed-off-by: Bjorn Andersson Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Vinod Koul --- .../devicetree/bindings/pci/qcom,pcie.txt | 25 ++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt index 1fd703bd73e0..ada80b01bf0c 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt @@ -10,6 +10,7 @@ - "qcom,pcie-msm8996" for msm8996 or apq8096 - "qcom,pcie-ipq4019" for ipq4019 - "qcom,pcie-ipq8074" for ipq8074 + - "qcom,pcie-qcs404" for qcs404 - reg: Usage: required @@ -116,6 +117,15 @@ - "ahb" AHB clock - "aux" Auxiliary clock +- clock-names: + Usage: required for qcs404 + Value type: + Definition: Should contain the following entries + - "iface" AHB clock + - "aux" Auxiliary clock + - "master_bus" AXI Master clock + - "slave_bus" AXI Slave clock + - resets: Usage: required Value type: @@ -167,6 +177,17 @@ - "ahb" AHB Reset - "axi_m_sticky" AXI Master Sticky reset +- reset-names: + Usage: required for qcs404 + Value type: + Definition: Should contain the following entries + - "axi_m" AXI Master reset + - "axi_s" AXI Slave reset + - "axi_m_sticky" AXI Master Sticky reset + - "pipe_sticky" PIPE sticky reset + - "pwr" PWR reset + - "ahb" AHB reset + - power-domains: Usage: required for apq8084 and msm8996/apq8096 Value type: @@ -195,12 +216,12 @@ Definition: A phandle to the PCIe endpoint power supply - phys: - Usage: required for apq8084 + Usage: required for apq8084 and qcs404 Value type: Definition: List of phandle(s) as listed in phy-names property - phy-names: - Usage: required for apq8084 + Usage: required for apq8084 and qcs404 Value type: Definition: Should contain "pciephy" -- cgit From 42f6ebd827832e62a37350ffad776ea785a2486b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 23 May 2019 07:43:43 -0300 Subject: docs: cdomain.py: get rid of a warning since version 1.8 There's a new warning about a deprecation function. Add a logic at cdomain.py to avoid that. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/sphinx/cdomain.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py index cf13ff3a656c..cbac8e608dc4 100644 --- a/Documentation/sphinx/cdomain.py +++ b/Documentation/sphinx/cdomain.py @@ -48,7 +48,10 @@ major, minor, patch = sphinx.version_info[:3] def setup(app): - app.override_domain(CDomain) + if (major == 1 and minor < 8): + app.override_domain(CDomain) + else: + app.add_domain(CDomain, override=True) return dict( version = __version__, -- cgit From fe4ec72cca500b2f97ffa0429b4cd57f67e0821d Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 21 May 2019 21:30:00 +0900 Subject: docs: tracing: Fix typos in histogram.rst This patch fixes some spelling typos in histogram.rst Signed-off-by: Masanari Iida Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- Documentation/trace/histogram.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index fb621a1c2638..8408670d0328 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1010,7 +1010,7 @@ Extended error information For example, suppose we wanted to take a look at the relative weights in terms of skb length for each callpath that leads to a - netif_receieve_skb event when downloading a decent-sized file using + netif_receive_skb event when downloading a decent-sized file using wget. First we set up an initially paused stacktrace trigger on the @@ -1843,7 +1843,7 @@ practice, not every handler.action combination is currently supported; if a given handler.action combination isn't supported, the hist trigger will fail with -EINVAL; -The default 'handler.action' if none is explicity specified is as it +The default 'handler.action' if none is explicitly specified is as it always has been, to simply update the set of values associated with an entry. Some applications, however, may want to perform additional actions at that point, such as generate another event, or compare and @@ -2088,7 +2088,7 @@ The following commonly-used handler.action pairs are available: and the saved values corresponding to the max are displayed following the rest of the fields. - If a snaphot was taken, there is also a message indicating that, + If a snapshot was taken, there is also a message indicating that, along with the value and event that triggered the global maximum: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist @@ -2176,7 +2176,7 @@ The following commonly-used handler.action pairs are available: hist trigger entry. Note that in this case the changed value is a global variable - associated withe current trace instance. The key of the specific + associated with current trace instance. The key of the specific trace event that caused the value to change and the global value itself are displayed, along with a message stating that a snapshot has been taken and where to find it. The user can use the key @@ -2203,7 +2203,7 @@ The following commonly-used handler.action pairs are available: and the saved values corresponding to that value are displayed following the rest of the fields. - If a snaphot was taken, there is also a message indicating that, + If a snapshot was taken, there is also a message indicating that, along with the value and event that triggered the snapshot:: # cat /sys/kernel/debug/tracing/events/tcp/tcp_probe/hist -- cgit From 93285c01977729a2e046e065e4b99791b966130c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 21 May 2019 10:32:08 +0800 Subject: doc: kernel-parameters.txt: fix documentation of nmi_watchdog parameter The default behavior of hardlockup depends on the config of CONFIG_BOOTPARAM_HARDLOCKUP_PANIC. Fix the description of nmi_watchdog to make it clear. Suggested-by: Steven Rostedt (VMware) Signed-off-by: Zhenzhong Duan Reviewed-by: Joel Fernandes (Google) Acked-by: Ingo Molnar Acked-by: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Kees Cook Cc: Greg Kroah-Hartman Cc: linux-doc@vger.kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..79d043b8850d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2836,8 +2836,9 @@ 0 - turn hardlockup detector in nmi_watchdog off 1 - turn hardlockup detector in nmi_watchdog on When panic is specified, panic when an NMI watchdog - timeout occurs (or 'nopanic' to override the opposite - default). To disable both hard and soft lockup detectors, + timeout occurs (or 'nopanic' to not panic on an NMI + watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) + To disable both hard and soft lockup detectors, please see 'nowatchdog'. This is useful when you use a panic=... timeout and need the box quickly up again. -- cgit From 50c1f43a37d006ac24755397614b00064a8f293a Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:05 +1000 Subject: docs: filesystems: vfs: Remove space before tab Currently the file has a bunch of spaces before tabspaces. This is a nuisance when patching the file because they show up whenever we touch these lines. Let's just fix them all now in preparation for doing the RST conversion. Remove spaces before tabspaces. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 78 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 57fc576b1f3e..cab5a36f39c6 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -134,7 +134,7 @@ struct file_system_type { should be shut down owner: for internal VFS use: you should initialize this to THIS_MODULE in - most cases. + most cases. next: for internal VFS use: you should initialize this to NULL @@ -143,7 +143,7 @@ struct file_system_type { The mount() method has the following arguments: struct file_system_type *fs_type: describes the filesystem, partly initialized - by the specific filesystem code + by the specific filesystem code int flags: mount flags @@ -180,12 +180,12 @@ and provides a fill_super() callback instead. The generic variants are: mount_nodev: mount a filesystem that is not backed by a device mount_single: mount a filesystem which shares the instance between - all mounts + all mounts A fill_super() callback implementation has the following arguments: struct super_block *sb: the superblock structure. The callback - must initialize this properly. + must initialize this properly. void *data: arbitrary mount options, usually comes as an ASCII string (see "Mount Options" section) @@ -236,14 +236,14 @@ only called from a process context (i.e. not from an interrupt handler or bottom half). alloc_inode: this method is called by alloc_inode() to allocate memory - for struct inode and initialize it. If this function is not - defined, a simple 'struct inode' is allocated. Normally - alloc_inode will be used to allocate a larger structure which - contains a 'struct inode' embedded within it. + for struct inode and initialize it. If this function is not + defined, a simple 'struct inode' is allocated. Normally + alloc_inode will be used to allocate a larger structure which + contains a 'struct inode' embedded within it. destroy_inode: this method is called by destroy_inode() to release - resources allocated for struct inode. It is only required if - ->alloc_inode was defined and simply undoes anything done by + resources allocated for struct inode. It is only required if + ->alloc_inode was defined and simply undoes anything done by ->alloc_inode. dirty_inode: this method is called by the VFS to mark an inode dirty. @@ -271,15 +271,15 @@ or bottom half). (i.e. unmount). This is called with the superblock lock held sync_fs: called when VFS is writing out all dirty data associated with - a superblock. The second parameter indicates whether the method + a superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. freeze_fs: called when VFS is locking a filesystem and - forcing it into a consistent state. This method is currently - used by the Logical Volume Manager (LVM). + forcing it into a consistent state. This method is currently + used by the Logical Volume Manager (LVM). unfreeze_fs: called when VFS is unlocking a filesystem and making it writable - again. + again. statfs: called when the VFS needs to get filesystem statistics. @@ -476,30 +476,30 @@ otherwise noted. that. permission: called by the VFS to check for access rights on a POSIX-like - filesystem. + filesystem. May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or + mode, the filesystem must check the permission without blocking or storing to the inode. If a situation is encountered that rcu-walk cannot handle, return -ECHILD and it will be called again in ref-walk mode. setattr: called by the VFS to set attributes for a file. This method - is called by chmod(2) and related system calls. + is called by chmod(2) and related system calls. getattr: called by the VFS to get attributes of a file. This method - is called by stat(2) and related system calls. + is called by stat(2) and related system calls. listxattr: called by the VFS to list all extended attributes for a given file. This method is called by the listxattr(2) system call. update_time: called by the VFS to update a specific time or the i_version of - an inode. If this is not defined the VFS will update the inode itself - and call mark_inode_dirty_sync. + an inode. If this is not defined the VFS will update the inode itself + and call mark_inode_dirty_sync. atomic_open: called on the last component of an open. Using this optional - method the filesystem can look up, possibly create and open the file in + method the filesystem can look up, possibly create and open the file in one atomic operation. If it wants to leave actual opening to the caller (e.g. if the file turned out to be a symlink, device, or just something filesystem won't do atomic open for), it may signal this by @@ -687,13 +687,13 @@ struct address_space_operations { that all succeeds, ->readpage will be called again. writepages: called by the VM to write out pages associated with the - address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then - the writeback_control will specify a range of pages that must be - written out. If it is WBC_SYNC_NONE, then a nr_to_write is given + address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then + the writeback_control will specify a range of pages that must be + written out. If it is WBC_SYNC_NONE, then a nr_to_write is given and that many pages should be written if possible. If no ->writepages is given, then mpage_writepages is used - instead. This will choose pages from the address space that are - tagged as DIRTY and will pass them to ->writepage. + instead. This will choose pages from the address space that are + tagged as DIRTY and will pass them to ->writepage. set_page_dirty: called by the VM to set a page dirty. This is particularly needed if an address space attaches @@ -704,11 +704,11 @@ struct address_space_operations { PAGECACHE_TAG_DIRTY tag in the radix tree. readpages: called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of - readpage. Instead of just one page, several pages are - requested. + object. This is essentially just a vector version of + readpage. Instead of just one page, several pages are + requested. readpages is only used for read-ahead, so read errors are - ignored. If anything goes wrong, feel free to give up. + ignored. If anything goes wrong, feel free to give up. write_begin: Called by the generic buffered write code to ask the filesystem to @@ -745,12 +745,12 @@ struct address_space_operations { that were able to be copied into pagecache. bmap: called by the VFS to map a logical block offset within object to - physical block number. This method is used by the FIBMAP - ioctl and for working with swap-files. To be able to swap to - a file, the file must have a stable mapping to a block - device. The swap system does not go through the filesystem - but instead uses bmap to find out where the blocks in the file - are and uses those addresses directly. + physical block number. This method is used by the FIBMAP + ioctl and for working with swap-files. To be able to swap to + a file, the file must have a stable mapping to a block + device. The swap system does not go through the filesystem + but instead uses bmap to find out where the blocks in the file + are and uses those addresses directly. invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed @@ -810,7 +810,7 @@ struct address_space_operations { putback_page: Called by the VM when isolated page's migration fails. launder_page: Called before freeing a page - it writes back the dirty page. To - prevent redirtying the page, it is kept locked during the whole + prevent redirtying the page, it is kept locked during the whole operation. is_partially_uptodate: Called by the VM when reading a file through the @@ -921,7 +921,7 @@ otherwise noted. unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls - are used on 64 bit kernels. + are used on 64 bit kernels. mmap: called by the mmap(2) system call @@ -946,7 +946,7 @@ otherwise noted. (non-blocking) mode is enabled for a file lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW - commands + commands get_unmapped_area: called by the mmap(2) system call -- cgit From 4ee33ea403ac7c1f2b04534132ebb9c3c5095b56 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:06 +1000 Subject: docs: filesystems: vfs: Use uniform space after period. Currently sometimes document has a single space after a period and sometimes it has double. Whichever we use it should be uniform. Use double space after period, be uniform. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 246 +++++++++++++++++++------------------- 1 file changed, 123 insertions(+), 123 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index cab5a36f39c6..6088b925aa7f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -14,12 +14,12 @@ Introduction The Virtual File System (also known as the Virtual Filesystem Switch) is the software layer in the kernel that provides the filesystem -interface to userspace programs. It also provides an abstraction +interface to userspace programs. It also provides an abstraction within the kernel which allows different filesystem implementations to coexist. VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so -on are called from a process context. Filesystem locking is described +on are called from a process context. Filesystem locking is described in the document Documentation/filesystems/Locking. @@ -27,37 +27,37 @@ Directory Entry Cache (dcache) ------------------------------ The VFS implements the open(2), stat(2), chmod(2), and similar system -calls. The pathname argument that is passed to them is used by the VFS +calls. The pathname argument that is passed to them is used by the VFS to search through the directory entry cache (also known as the dentry -cache or dcache). This provides a very fast look-up mechanism to -translate a pathname (filename) into a specific dentry. Dentries live +cache or dcache). This provides a very fast look-up mechanism to +translate a pathname (filename) into a specific dentry. Dentries live in RAM and are never saved to disc: they exist only for performance. -The dentry cache is meant to be a view into your entire filespace. As +The dentry cache is meant to be a view into your entire filespace. As most computers cannot fit all dentries in the RAM at the same time, -some bits of the cache are missing. In order to resolve your pathname +some bits of the cache are missing. In order to resolve your pathname into a dentry, the VFS may have to resort to creating dentries along -the way, and then loading the inode. This is done by looking up the +the way, and then loading the inode. This is done by looking up the inode. The Inode Object ---------------- -An individual dentry usually has a pointer to an inode. Inodes are +An individual dentry usually has a pointer to an inode. Inodes are filesystem objects such as regular files, directories, FIFOs and other beasts. They live either on the disc (for block device filesystems) -or in the memory (for pseudo filesystems). Inodes that live on the +or in the memory (for pseudo filesystems). Inodes that live on the disc are copied into the memory when required and changes to the inode -are written back to disc. A single inode can be pointed to by multiple +are written back to disc. A single inode can be pointed to by multiple dentries (hard links, for example, do this). To look up an inode requires that the VFS calls the lookup() method of -the parent directory inode. This method is installed by the specific -filesystem implementation that the inode lives in. Once the VFS has +the parent directory inode. This method is installed by the specific +filesystem implementation that the inode lives in. Once the VFS has the required dentry (and hence the inode), we can do all those boring things like open(2) the file, or stat(2) it to peek at the inode -data. The stat(2) operation is fairly simple: once the VFS has the +data. The stat(2) operation is fairly simple: once the VFS has the dentry, it peeks at the inode data and passes some of it back to userspace. @@ -67,17 +67,17 @@ The File Object Opening a file requires another operation: allocation of a file structure (this is the kernel-side implementation of file -descriptors). The freshly allocated file structure is initialized with +descriptors). The freshly allocated file structure is initialized with a pointer to the dentry and a set of file operation member functions. -These are taken from the inode data. The open() file method is then -called so the specific filesystem implementation can do its work. You -can see that this is another switch performed by the VFS. The file +These are taken from the inode data. The open() file method is then +called so the specific filesystem implementation can do its work. You +can see that this is another switch performed by the VFS. The file structure is placed into the file descriptor table for the process. Reading, writing and closing files (and other assorted VFS operations) is done by using the userspace file descriptor to grab the appropriate file structure, and then calling the required file structure method to -do whatever is required. For as long as the file is open, it keeps the +do whatever is required. For as long as the file is open, it keeps the dentry in use, which in turn means that the VFS inode is still in use. @@ -92,7 +92,7 @@ functions: extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); -The passed struct file_system_type describes your filesystem. When a +The passed struct file_system_type describes your filesystem. When a request is made to mount a filesystem onto a directory in your namespace, the VFS will call the appropriate mount() method for the specific filesystem. New vfsmount referring to the tree returned by ->mount() @@ -106,7 +106,7 @@ file /proc/filesystems. struct file_system_type ----------------------- -This describes the filesystem. As of kernel 2.6.39, the following +This describes the filesystem. As of kernel 2.6.39, the following members are defined: struct file_system_type { @@ -168,12 +168,12 @@ point of view is a reference to dentry at the root of (sub)tree to be attached; creation of new superblock is a common side effect. The most interesting member of the superblock structure that the -mount() method fills in is the "s_op" field. This is a pointer to +mount() method fills in is the "s_op" field. This is a pointer to a "struct super_operations" which describes the next level of the filesystem implementation. Usually, a filesystem uses one of the generic mount() implementations -and provides a fill_super() callback instead. The generic variants are: +and provides a fill_super() callback instead. The generic variants are: mount_bdev: mount a filesystem residing on a block device @@ -184,7 +184,7 @@ and provides a fill_super() callback instead. The generic variants are: A fill_super() callback implementation has the following arguments: - struct super_block *sb: the superblock structure. The callback + struct super_block *sb: the superblock structure. The callback must initialize this properly. void *data: arbitrary mount options, usually comes as an ASCII @@ -203,7 +203,7 @@ struct super_operations ----------------------- This describes how the VFS can manipulate the superblock of your -filesystem. As of kernel 2.6.22, the following members are defined: +filesystem. As of kernel 2.6.22, the following members are defined: struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); @@ -231,7 +231,7 @@ struct super_operations { }; All methods are called without any locks being held, unless otherwise -noted. This means that most methods can block safely. All methods are +noted. This means that most methods can block safely. All methods are only called from a process context (i.e. not from an interrupt handler or bottom half). @@ -268,11 +268,11 @@ or bottom half). delete_inode: called when the VFS wants to delete an inode put_super: called when the VFS wishes to free the superblock - (i.e. unmount). This is called with the superblock lock held + (i.e. unmount). This is called with the superblock lock held sync_fs: called when VFS is writing out all dirty data associated with - a superblock. The second parameter indicates whether the method - should wait until the write out has been completed. Optional. + a superblock. The second parameter indicates whether the method + should wait until the write out has been completed. Optional. freeze_fs: called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently @@ -283,10 +283,10 @@ or bottom half). statfs: called when the VFS needs to get filesystem statistics. - remount_fs: called when the filesystem is remounted. This is called + remount_fs: called when the filesystem is remounted. This is called with the kernel lock held - clear_inode: called then the VFS clears the inode. Optional + clear_inode: called then the VFS clears the inode. Optional umount_begin: called when the VFS is unmounting a filesystem. @@ -307,17 +307,17 @@ or bottom half). implement ->nr_cached_objects for it to be called correctly. We can't do anything with any errors that the filesystem might - encountered, hence the void return type. This will never be called if + encountered, hence the void return type. This will never be called if the VM is trying to reclaim under GFP_NOFS conditions, hence this method does not need to handle that situation itself. Implementations must include conditional reschedule calls inside any - scanning loop that is done. This allows the VFS to determine + scanning loop that is done. This allows the VFS to determine appropriate scan batch sizes without having to worry about whether implementations will cause holdoff problems due to large scan batch sizes. -Whoever sets up the inode is responsible for filling in the "i_op" field. This +Whoever sets up the inode is responsible for filling in the "i_op" field. This is a pointer to a "struct inode_operations" which describes the methods that can be performed on individual inodes. @@ -361,7 +361,7 @@ struct inode_operations ----------------------- This describes how the VFS can manipulate an inode in your -filesystem. As of kernel 2.6.22, the following members are defined: +filesystem. As of kernel 2.6.22, the following members are defined: struct inode_operations { int (*create) (struct inode *,struct dentry *, umode_t, bool); @@ -391,19 +391,19 @@ struct inode_operations { Again, all methods are called without any locks being held, unless otherwise noted. - create: called by the open(2) and creat(2) system calls. Only - required if you want to support regular files. The dentry you + create: called by the open(2) and creat(2) system calls. Only + required if you want to support regular files. The dentry you get should not have an inode (i.e. it should be a negative - dentry). Here you will probably call d_instantiate() with the + dentry). Here you will probably call d_instantiate() with the dentry and the newly created inode lookup: called when the VFS needs to look up an inode in a parent - directory. The name to look for is found in the dentry. This + directory. The name to look for is found in the dentry. This method must call d_add() to insert the found inode into the - dentry. The "i_count" field in the inode structure should be - incremented. If the named inode does not exist a NULL inode + dentry. The "i_count" field in the inode structure should be + incremented. If the named inode does not exist a NULL inode should be inserted into the dentry (this is called a negative - dentry). Returning an error code from this routine must only + dentry). Returning an error code from this routine must only be done on a real error, otherwise creating inodes with system calls like create(2), mknod(2), mkdir(2) and so on will fail. If you wish to overload the dentry methods then you should @@ -411,27 +411,27 @@ otherwise noted. to a struct "dentry_operations". This method is called with the directory inode semaphore held - link: called by the link(2) system call. Only required if you want - to support hard links. You will probably need to call + link: called by the link(2) system call. Only required if you want + to support hard links. You will probably need to call d_instantiate() just as you would in the create() method - unlink: called by the unlink(2) system call. Only required if you + unlink: called by the unlink(2) system call. Only required if you want to support deleting inodes - symlink: called by the symlink(2) system call. Only required if you - want to support symlinks. You will probably need to call + symlink: called by the symlink(2) system call. Only required if you + want to support symlinks. You will probably need to call d_instantiate() just as you would in the create() method - mkdir: called by the mkdir(2) system call. Only required if you want - to support creating subdirectories. You will probably need to + mkdir: called by the mkdir(2) system call. Only required if you want + to support creating subdirectories. You will probably need to call d_instantiate() just as you would in the create() method - rmdir: called by the rmdir(2) system call. Only required if you want + rmdir: called by the rmdir(2) system call. Only required if you want to support deleting subdirectories mknod: called by the mknod(2) system call to create a device (char, - block) inode or a named pipe (FIFO) or socket. Only required - if you want to support creating these types of inodes. You + block) inode or a named pipe (FIFO) or socket. Only required + if you want to support creating these types of inodes. You will probably need to call d_instantiate() just as you would in the create() method @@ -478,21 +478,21 @@ otherwise noted. permission: called by the VFS to check for access rights on a POSIX-like filesystem. - May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or + May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk + mode, the filesystem must check the permission without blocking or storing to the inode. If a situation is encountered that rcu-walk cannot handle, return -ECHILD and it will be called again in ref-walk mode. - setattr: called by the VFS to set attributes for a file. This method + setattr: called by the VFS to set attributes for a file. This method is called by chmod(2) and related system calls. - getattr: called by the VFS to get attributes of a file. This method + getattr: called by the VFS to get attributes of a file. This method is called by stat(2) and related system calls. listxattr: called by the VFS to list all extended attributes for a - given file. This method is called by the listxattr(2) system call. + given file. This method is called by the listxattr(2) system call. update_time: called by the VFS to update a specific time or the i_version of an inode. If this is not defined the VFS will update the inode itself @@ -530,7 +530,7 @@ The first can be used independently to the others. The VM can try to either write dirty pages in order to clean them, or release clean pages in order to reuse them. To do this it can call the ->writepage method on dirty pages, and ->releasepage on clean pages with -PagePrivate set. Clean pages without PagePrivate and with no external +PagePrivate set. Clean pages without PagePrivate and with no external references will be released without notice being given to the address_space. @@ -538,7 +538,7 @@ To achieve this functionality, pages need to be placed on an LRU with lru_cache_add and mark_page_active needs to be called whenever the page is used. -Pages are normally kept in a radix tree index by ->index. This tree +Pages are normally kept in a radix tree index by ->index. This tree maintains information about the PG_Dirty and PG_Writeback status of each page, so that pages with either of these flags can be found quickly. @@ -624,7 +624,7 @@ struct address_space_operations ------------------------------- This describes how the VFS can manipulate mapping of a file to page cache in -your filesystem. The following members are defined: +your filesystem. The following members are defined: struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -704,7 +704,7 @@ struct address_space_operations { PAGECACHE_TAG_DIRTY tag in the radix tree. readpages: called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of + object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are @@ -712,7 +712,7 @@ struct address_space_operations { write_begin: Called by the generic buffered write code to ask the filesystem to - prepare to write len bytes at the given offset in the file. The + prepare to write len bytes at the given offset in the file. The address_space should check that the write will be able to complete, by allocating space if necessary and doing any other internal housekeeping. If the write will update parts of any basic-blocks on @@ -735,7 +735,7 @@ struct address_space_operations { which case write_end is not called. write_end: After a successful write_begin, and data copy, write_end must - be called. len is the original len passed to write_begin, and copied + be called. len is the original len passed to write_begin, and copied is the amount that was able to be copied. The filesystem must take care of unlocking the page and releasing it @@ -745,7 +745,7 @@ struct address_space_operations { that were able to be copied into pagecache. bmap: called by the VFS to map a logical block offset within object to - physical block number. This method is used by the FIBMAP + physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to a file, the file must have a stable mapping to a block device. The swap system does not go through the filesystem @@ -757,7 +757,7 @@ struct address_space_operations { from the address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_SIZE). Any private data associated with the page + will be PAGE_SIZE). Any private data associated with the page should be updated to reflect this truncation. If offset is 0 and length is PAGE_SIZE, then the private data should be released, because the page must be able to be completely discarded. This may @@ -767,7 +767,7 @@ struct address_space_operations { releasepage: releasepage is called on PagePrivate pages to indicate that the page should be freed if possible. ->releasepage should remove any private data from the page and clear the - PagePrivate flag. If releasepage() fails for some reason, it must + PagePrivate flag. If releasepage() fails for some reason, it must indicate failure with a 0 return value. releasepage() is used in two distinct though related cases. The first is when the VM finds a clean page with no active users and @@ -787,7 +787,7 @@ struct address_space_operations { freepage: freepage is called once the page is no longer visible in the page cache in order to allow the cleanup of any private - data. Since it may be called by the memory reclaimer, it + data. Since it may be called by the memory reclaimer, it should not assume that the original address_space mapping still exists, and it should not block. @@ -809,32 +809,32 @@ struct address_space_operations { putback_page: Called by the VM when isolated page's migration fails. - launder_page: Called before freeing a page - it writes back the dirty page. To + launder_page: Called before freeing a page - it writes back the dirty page. To prevent redirtying the page, it is kept locked during the whole operation. is_partially_uptodate: Called by the VM when reading a file through the - pagecache when the underlying blocksize != pagesize. If the required + pagecache when the underlying blocksize != pagesize. If the required block is up to date then the read can complete without needing the IO to bring the whole page up to date. is_dirty_writeback: Called by the VM when attempting to reclaim a page. The VM uses dirty and writeback information to determine if it needs - to stall to allow flushers a chance to complete some IO. Ordinarily + to stall to allow flushers a chance to complete some IO. Ordinarily it can use PageDirty and PageWriteback but some filesystems have more complex state (unstable pages in NFS prevent reclaim) or - do not set those flags due to locking problems. This callback + do not set those flags due to locking problems. This callback allows a filesystem to indicate to the VM if a page should be treated as dirty or writeback for the purposes of stalling. error_remove_page: normally set to generic_error_remove_page if truncation - is ok for this address space. Used for memory failure handling. + is ok for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. swap_activate: Called when swapon is used on a file to allocate space if necessary and pin the block lookup information in - memory. A return value of zero indicates success, + memory. A return value of zero indicates success, in which case this file can be used to back swapspace. swap_deactivate: Called during swapoff on files where swap_activate @@ -844,14 +844,14 @@ struct address_space_operations { The File Object =============== -A file object represents a file opened by a process. This is also known +A file object represents a file opened by a process. This is also known as an "open file description" in POSIX parlance. struct file_operations ---------------------- -This describes how the VFS can manipulate an open file. As of kernel +This describes how the VFS can manipulate an open file. As of kernel 4.18, the following members are defined: struct file_operations { @@ -916,7 +916,7 @@ otherwise noted. poll: called by the VFS when a process wants to check if there is activity on this file and (optionally) go to sleep until there - is activity. Called by the select(2) and poll(2) system calls + is activity. Called by the select(2) and poll(2) system calls unlocked_ioctl: called by the ioctl(2) system call. @@ -925,13 +925,13 @@ otherwise noted. mmap: called by the mmap(2) system call - open: called by the VFS when an inode should be opened. When the VFS - opens a file, it creates a new "struct file". It then calls the - open method for the newly allocated file structure. You might + open: called by the VFS when an inode should be opened. When the VFS + opens a file, it creates a new "struct file". It then calls the + open method for the newly allocated file structure. You might think that the open method really belongs in - "struct inode_operations", and you may be right. I think it's + "struct inode_operations", and you may be right. I think it's done the way it is because it makes filesystems simpler to - implement. The open() method is a good place to initialize the + implement. The open() method is a good place to initialize the "private_data" member in the file structure if you want to point to a device structure @@ -939,7 +939,7 @@ otherwise noted. release: called when the last reference to an open file is closed - fsync: called by the fsync(2) system call. Also see the section above + fsync: called by the fsync(2) system call. Also see the section above entitled "Handling errors during writeback". fasync: called by the fcntl(2) system call when asynchronous @@ -954,13 +954,13 @@ otherwise noted. flock: called by the flock(2) system call - splice_write: called by the VFS to splice data from a pipe to a file. This + splice_write: called by the VFS to splice data from a pipe to a file. This method is used by the splice(2) system call - splice_read: called by the VFS to splice data from file to a pipe. This + splice_read: called by the VFS to splice data from file to a pipe. This method is used by the splice(2) system call - setlease: called by the VFS to set or release a file lock lease. setlease + setlease: called by the VFS to set or release a file lock lease. setlease implementations should call generic_setlease to record or remove the lease in the inode after setting it. @@ -984,12 +984,12 @@ otherwise noted. fadvise: possibly called by the fadvise64() system call. Note that the file operations are implemented by the specific -filesystem in which the inode resides. When opening a device node +filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special support routines in the VFS which will locate the required device -driver information. These support routines replace the filesystem file +driver information. These support routines replace the filesystem file operations with those for the device driver, and then proceed to call -the new open() method for the file. This is how opening a device file +the new open() method for the file. This is how opening a device file in the filesystem eventually ends up calling the device driver open() method. @@ -1002,10 +1002,10 @@ struct dentry_operations ------------------------ This describes how a filesystem can overload the standard dentry -operations. Dentries and the dcache are the domain of the VFS and the -individual filesystem implementations. Device drivers have no business -here. These methods may be set to NULL, as they are either optional or -the VFS uses a default. As of kernel 2.6.22, the following members are +operations. Dentries and the dcache are the domain of the VFS and the +individual filesystem implementations. Device drivers have no business +here. These methods may be set to NULL, as they are either optional or +the VFS uses a default. As of kernel 2.6.22, the following members are defined: struct dentry_operations { @@ -1024,10 +1024,10 @@ struct dentry_operations { struct dentry *(*d_real)(struct dentry *, const struct inode *); }; - d_revalidate: called when the VFS needs to revalidate a dentry. This + d_revalidate: called when the VFS needs to revalidate a dentry. This is called whenever a name look-up finds a dentry in the - dcache. Most local filesystems leave this as NULL, because all their - dentries in the dcache are valid. Network filesystems are different + dcache. Most local filesystems leave this as NULL, because all their + dentries in the dcache are valid. Network filesystems are different since things can change on the server without the client necessarily being aware of it. @@ -1045,11 +1045,11 @@ struct dentry_operations { d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry. This is called when a path-walk ends at dentry that was not acquired by - doing a lookup in the parent directory. This includes "/", "." and "..", + doing a lookup in the parent directory. This includes "/", "." and "..", as well as procfs-style symlinks and mountpoint traversal. In this case, we are less concerned with whether the dentry is still - fully correct, but rather that the inode is still valid. As with + fully correct, but rather that the inode is still valid. As with d_revalidate, most local filesystems will set this to NULL since their dcache entries are always valid. @@ -1057,17 +1057,17 @@ struct dentry_operations { d_weak_revalidate is only called after leaving rcu-walk mode. - d_hash: called when the VFS adds a dentry to the hash table. The first + d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is to be hashed into. Same locking and synchronisation rules as d_compare regarding what is safe to dereference etc. - d_compare: called to compare a dentry name with a given name. The first + d_compare: called to compare a dentry name with a given name. The first dentry is the parent of the dentry to be compared, the second is - the child dentry. len and name string are properties of the dentry - to be compared. qstr is the name to compare it with. + the child dentry. len and name string are properties of the dentry + to be compared. qstr is the name to compare it with. Must be constant and idempotent, and should not take locks if possible, and should not or store into the dentry. @@ -1082,9 +1082,9 @@ struct dentry_operations { "rcu-walk", ie. without any locks or references on things. d_delete: called when the last reference to a dentry is dropped and the - dcache is deciding whether or not to cache it. Return 1 to delete - immediately, or 0 to cache the dentry. Default is NULL which means to - always cache a reachable dentry. d_delete must be constant and + dcache is deciding whether or not to cache it. Return 1 to delete + immediately, or 0 to cache the dentry. Default is NULL which means to + always cache a reachable dentry. d_delete must be constant and idempotent. d_init: called when a dentry is allocated @@ -1092,19 +1092,19 @@ struct dentry_operations { d_release: called when a dentry is really deallocated d_iput: called when a dentry loses its inode (just prior to its - being deallocated). The default when this is NULL is that the - VFS calls iput(). If you define this method, you must call + being deallocated). The default when this is NULL is that the + VFS calls iput(). If you define this method, you must call iput() yourself d_dname: called when the pathname of a dentry should be generated. Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay - pathname generation. (Instead of doing it when dentry is created, - it's done only when the path is needed.). Real filesystems probably + pathname generation. (Instead of doing it when dentry is created, + it's done only when the path is needed.). Real filesystems probably dont want to use it, because their dentries are present in global - dcache hash, so their hash should be an invariant. As no lock is + dcache hash, so their hash should be an invariant. As no lock is held, d_dname() should not try to modify the dentry itself, unless - appropriate SMP safety is used. CAUTION : d_path() logic is quite - tricky. The correct way to return for example "Hello" is to put it + appropriate SMP safety is used. CAUTION : d_path() logic is quite + tricky. The correct way to return for example "Hello" is to put it at the end of the buffer, and returns a pointer to the first char. dynamic_dname() helper function is provided to take care of this. @@ -1166,7 +1166,7 @@ struct dentry_operations { With NULL inode the topmost real underlying dentry is returned. Each dentry has a pointer to its parent dentry, as well as a hash list -of child dentries. Child dentries are basically like files in a +of child dentries. Child dentries are basically like files in a directory. @@ -1179,36 +1179,36 @@ manipulate dentries: dget: open a new handle for an existing dentry (this just increments the usage count) - dput: close a handle for a dentry (decrements the usage count). If + dput: close a handle for a dentry (decrements the usage count). If the usage count drops to 0, and the dentry is still in its parent's hash, the "d_delete" method is called to check whether - it should be cached. If it should not be cached, or if the dentry - is not hashed, it is deleted. Otherwise cached dentries are put + it should be cached. If it should not be cached, or if the dentry + is not hashed, it is deleted. Otherwise cached dentries are put into an LRU list to be reclaimed on memory shortage. - d_drop: this unhashes a dentry from its parents hash list. A + d_drop: this unhashes a dentry from its parents hash list. A subsequent call to dput() will deallocate the dentry if its usage count drops to 0 - d_delete: delete a dentry. If there are no other open references to + d_delete: delete a dentry. If there are no other open references to the dentry then the dentry is turned into a negative dentry - (the d_iput() method is called). If there are other + (the d_iput() method is called). If there are other references, then d_drop() is called instead d_add: add a dentry to its parents hash list and then calls d_instantiate() d_instantiate: add a dentry to the alias hash list for the inode and - updates the "d_inode" member. The "i_count" member in the - inode structure should be set/incremented. If the inode + updates the "d_inode" member. The "i_count" member in the + inode structure should be set/incremented. If the inode pointer is NULL, the dentry is called a "negative - dentry". This function is commonly called when an inode is + dentry". This function is commonly called when an inode is created for an existing negative dentry d_lookup: look up a dentry given its parent and path name component It looks up the child of that given name from the dcache - hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use dput() + hash table. If it is found, the reference count is incremented + and the dentry is returned. The caller must use dput() to free the dentry when it finishes using it. Mount Options -- cgit From 90caa781f6402a08b4e602fab7017baa3cee3a28 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:07 +1000 Subject: docs: filesystems: vfs: Use 72 character column width In preparation for conversion to RST format use the kernels favoured documentation column width. If we are going to do this we might as well do it thoroughly. Just do the paragraphs (not the indented stuff), the rest will be done during indentation fix up patch. This patch is whitespace only, no textual changes. Use 72 character column width for all paragraph sections. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 198 +++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 101 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 6088b925aa7f..1cd0e658137a 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -12,15 +12,14 @@ Introduction ============ -The Virtual File System (also known as the Virtual Filesystem Switch) -is the software layer in the kernel that provides the filesystem -interface to userspace programs. It also provides an abstraction -within the kernel which allows different filesystem implementations to -coexist. +The Virtual File System (also known as the Virtual Filesystem Switch) is +the software layer in the kernel that provides the filesystem interface +to userspace programs. It also provides an abstraction within the +kernel which allows different filesystem implementations to coexist. -VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so -on are called from a process context. Filesystem locking is described -in the document Documentation/filesystems/Locking. +VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on +are called from a process context. Filesystem locking is described in +the document Documentation/filesystems/Locking. Directory Entry Cache (dcache) @@ -34,11 +33,10 @@ translate a pathname (filename) into a specific dentry. Dentries live in RAM and are never saved to disc: they exist only for performance. The dentry cache is meant to be a view into your entire filespace. As -most computers cannot fit all dentries in the RAM at the same time, -some bits of the cache are missing. In order to resolve your pathname -into a dentry, the VFS may have to resort to creating dentries along -the way, and then loading the inode. This is done by looking up the -inode. +most computers cannot fit all dentries in the RAM at the same time, some +bits of the cache are missing. In order to resolve your pathname into a +dentry, the VFS may have to resort to creating dentries along the way, +and then loading the inode. This is done by looking up the inode. The Inode Object @@ -46,33 +44,32 @@ The Inode Object An individual dentry usually has a pointer to an inode. Inodes are filesystem objects such as regular files, directories, FIFOs and other -beasts. They live either on the disc (for block device filesystems) -or in the memory (for pseudo filesystems). Inodes that live on the -disc are copied into the memory when required and changes to the inode -are written back to disc. A single inode can be pointed to by multiple +beasts. They live either on the disc (for block device filesystems) or +in the memory (for pseudo filesystems). Inodes that live on the disc +are copied into the memory when required and changes to the inode are +written back to disc. A single inode can be pointed to by multiple dentries (hard links, for example, do this). To look up an inode requires that the VFS calls the lookup() method of the parent directory inode. This method is installed by the specific -filesystem implementation that the inode lives in. Once the VFS has -the required dentry (and hence the inode), we can do all those boring -things like open(2) the file, or stat(2) it to peek at the inode -data. The stat(2) operation is fairly simple: once the VFS has the -dentry, it peeks at the inode data and passes some of it back to -userspace. +filesystem implementation that the inode lives in. Once the VFS has the +required dentry (and hence the inode), we can do all those boring things +like open(2) the file, or stat(2) it to peek at the inode data. The +stat(2) operation is fairly simple: once the VFS has the dentry, it +peeks at the inode data and passes some of it back to userspace. The File Object --------------- Opening a file requires another operation: allocation of a file -structure (this is the kernel-side implementation of file -descriptors). The freshly allocated file structure is initialized with -a pointer to the dentry and a set of file operation member functions. -These are taken from the inode data. The open() file method is then -called so the specific filesystem implementation can do its work. You -can see that this is another switch performed by the VFS. The file -structure is placed into the file descriptor table for the process. +structure (this is the kernel-side implementation of file descriptors). +The freshly allocated file structure is initialized with a pointer to +the dentry and a set of file operation member functions. These are +taken from the inode data. The open() file method is then called so the +specific filesystem implementation can do its work. You can see that +this is another switch performed by the VFS. The file structure is +placed into the file descriptor table for the process. Reading, writing and closing files (and other assorted VFS operations) is done by using the userspace file descriptor to grab the appropriate @@ -93,11 +90,12 @@ functions: extern int unregister_filesystem(struct file_system_type *); The passed struct file_system_type describes your filesystem. When a -request is made to mount a filesystem onto a directory in your namespace, -the VFS will call the appropriate mount() method for the specific -filesystem. New vfsmount referring to the tree returned by ->mount() -will be attached to the mountpoint, so that when pathname resolution -reaches the mountpoint it will jump into the root of that vfsmount. +request is made to mount a filesystem onto a directory in your +namespace, the VFS will call the appropriate mount() method for the +specific filesystem. New vfsmount referring to the tree returned by +->mount() will be attached to the mountpoint, so that when pathname +resolution reaches the mountpoint it will jump into the root of that +vfsmount. You can see all filesystems that are registered to the kernel in the file /proc/filesystems. @@ -156,21 +154,21 @@ The mount() method must return the root dentry of the tree requested by caller. An active reference to its superblock must be grabbed and the superblock must be locked. On failure it should return ERR_PTR(error). -The arguments match those of mount(2) and their interpretation -depends on filesystem type. E.g. for block filesystems, dev_name is -interpreted as block device name, that device is opened and if it -contains a suitable filesystem image the method creates and initializes -struct super_block accordingly, returning its root dentry to caller. +The arguments match those of mount(2) and their interpretation depends +on filesystem type. E.g. for block filesystems, dev_name is interpreted +as block device name, that device is opened and if it contains a +suitable filesystem image the method creates and initializes struct +super_block accordingly, returning its root dentry to caller. ->mount() may choose to return a subtree of existing filesystem - it doesn't have to create a new one. The main result from the caller's -point of view is a reference to dentry at the root of (sub)tree to -be attached; creation of new superblock is a common side effect. +point of view is a reference to dentry at the root of (sub)tree to be +attached; creation of new superblock is a common side effect. -The most interesting member of the superblock structure that the -mount() method fills in is the "s_op" field. This is a pointer to -a "struct super_operations" which describes the next level of the -filesystem implementation. +The most interesting member of the superblock structure that the mount() +method fills in is the "s_op" field. This is a pointer to a "struct +super_operations" which describes the next level of the filesystem +implementation. Usually, a filesystem uses one of the generic mount() implementations and provides a fill_super() callback instead. The generic variants are: @@ -317,16 +315,16 @@ or bottom half). implementations will cause holdoff problems due to large scan batch sizes. -Whoever sets up the inode is responsible for filling in the "i_op" field. This -is a pointer to a "struct inode_operations" which describes the methods that -can be performed on individual inodes. +Whoever sets up the inode is responsible for filling in the "i_op" +field. This is a pointer to a "struct inode_operations" which describes +the methods that can be performed on individual inodes. struct xattr_handlers --------------------- On filesystems that support extended attributes (xattrs), the s_xattr -superblock field points to a NULL-terminated array of xattr handlers. Extended -attributes are name:value pairs. +superblock field points to a NULL-terminated array of xattr handlers. +Extended attributes are name:value pairs. name: Indicates that the handler matches attributes with the specified name (such as "system.posix_acl_access"); the prefix field must be NULL. @@ -346,9 +344,9 @@ attributes are name:value pairs. attribute. This method is called by the the setxattr(2) and removexattr(2) system calls. -When none of the xattr handlers of a filesystem match the specified attribute -name or when a filesystem doesn't support extended attributes, the various -*xattr(2) system calls return -EOPNOTSUPP. +When none of the xattr handlers of a filesystem match the specified +attribute name or when a filesystem doesn't support extended attributes, +the various *xattr(2) system calls return -EOPNOTSUPP. The Inode Object @@ -360,8 +358,8 @@ An inode object represents an object within the filesystem. struct inode_operations ----------------------- -This describes how the VFS can manipulate an inode in your -filesystem. As of kernel 2.6.22, the following members are defined: +This describes how the VFS can manipulate an inode in your filesystem. +As of kernel 2.6.22, the following members are defined: struct inode_operations { int (*create) (struct inode *,struct dentry *, umode_t, bool); @@ -517,42 +515,40 @@ The Address Space Object ======================== The address space object is used to group and manage pages in the page -cache. It can be used to keep track of the pages in a file (or -anything else) and also track the mapping of sections of the file into -process address spaces. +cache. It can be used to keep track of the pages in a file (or anything +else) and also track the mapping of sections of the file into process +address spaces. There are a number of distinct yet related services that an -address-space can provide. These include communicating memory -pressure, page lookup by address, and keeping track of pages tagged as -Dirty or Writeback. +address-space can provide. These include communicating memory pressure, +page lookup by address, and keeping track of pages tagged as Dirty or +Writeback. The first can be used independently to the others. The VM can try to -either write dirty pages in order to clean them, or release clean -pages in order to reuse them. To do this it can call the ->writepage -method on dirty pages, and ->releasepage on clean pages with -PagePrivate set. Clean pages without PagePrivate and with no external -references will be released without notice being given to the -address_space. +either write dirty pages in order to clean them, or release clean pages +in order to reuse them. To do this it can call the ->writepage method +on dirty pages, and ->releasepage on clean pages with PagePrivate set. +Clean pages without PagePrivate and with no external references will be +released without notice being given to the address_space. To achieve this functionality, pages need to be placed on an LRU with -lru_cache_add and mark_page_active needs to be called whenever the -page is used. +lru_cache_add and mark_page_active needs to be called whenever the page +is used. Pages are normally kept in a radix tree index by ->index. This tree -maintains information about the PG_Dirty and PG_Writeback status of -each page, so that pages with either of these flags can be found -quickly. +maintains information about the PG_Dirty and PG_Writeback status of each +page, so that pages with either of these flags can be found quickly. The Dirty tag is primarily used by mpage_writepages - the default ->writepages method. It uses the tag to find dirty pages to call ->writepage on. If mpage_writepages is not used (i.e. the address -provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is -almost unused. write_inode_now and sync_inode do use it (through +provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost +unused. write_inode_now and sync_inode do use it (through __sync_single_inode) to check if ->writepages has been successful in writing out the whole address_space. -The Writeback tag is used by filemap*wait* and sync_page* functions, -via filemap_fdatawait_range, to wait for all writeback to complete. +The Writeback tag is used by filemap*wait* and sync_page* functions, via +filemap_fdatawait_range, to wait for all writeback to complete. An address_space handler may attach extra information to a page, typically using the 'private' field in the 'struct page'. If such @@ -562,25 +558,24 @@ handler to deal with that data. An address space acts as an intermediate between storage and application. Data is read into the address space a whole page at a -time, and provided to the application either by copying of the page, -or by memory-mapping the page. -Data is written into the address space by the application, and then -written-back to storage typically in whole pages, however the -address_space has finer control of write sizes. +time, and provided to the application either by copying of the page, or +by memory-mapping the page. Data is written into the address space by +the application, and then written-back to storage typically in whole +pages, however the address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage -and writepages to writeback data to storage. +set_page_dirty to write data into the address_space, and writepage and +writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the inode's i_mutex. When data is written to a page, the PG_Dirty flag should be set. It typically remains set until writepage asks for it to be written. This -should clear PG_Dirty and set PG_Writeback. It can be actually -written at any point after PG_Dirty is clear. Once it is known to be -safe, PG_Writeback is cleared. +should clear PG_Dirty and set PG_Writeback. It can be actually written +at any point after PG_Dirty is clear. Once it is known to be safe, +PG_Writeback is cleared. Writeback makes use of a writeback_control structure to direct the operations. This gives the the writepage and writepages operations some @@ -609,9 +604,10 @@ file descriptors should get back an error is not possible. Instead, the generic writeback error tracking infrastructure in the kernel settles for reporting errors to fsync on all file descriptions that were open at the time that the error occurred. In a situation with -multiple writers, all of them will get back an error on a subsequent fsync, -even if all of the writes done through that particular file descriptor -succeeded (or even if there were no writes on that file descriptor at all). +multiple writers, all of them will get back an error on a subsequent +fsync, even if all of the writes done through that particular file +descriptor succeeded (or even if there were no writes on that file +descriptor at all). Filesystems that wish to use this infrastructure should call mapping_set_error to record the error in the address_space when it @@ -623,8 +619,8 @@ point in the stream of errors emitted by the backing device(s). struct address_space_operations ------------------------------- -This describes how the VFS can manipulate mapping of a file to page cache in -your filesystem. The following members are defined: +This describes how the VFS can manipulate mapping of a file to page +cache in your filesystem. The following members are defined: struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -1231,8 +1227,8 @@ filesystems. Showing options --------------- -If a filesystem accepts mount options, it must define show_options() -to show all the currently active options. The rules are: +If a filesystem accepts mount options, it must define show_options() to +show all the currently active options. The rules are: - options MUST be shown which are not default or their values differ from the default @@ -1240,14 +1236,14 @@ to show all the currently active options. The rules are: - options MAY be shown which are enabled by default or have their default value -Options used only internally between a mount helper and the kernel -(such as file descriptors), or which only have an effect during the -mounting (such as ones controlling the creation of a journal) are exempt -from the above rules. +Options used only internally between a mount helper and the kernel (such +as file descriptors), or which only have an effect during the mounting +(such as ones controlling the creation of a journal) are exempt from the +above rules. -The underlying reason for the above rules is to make sure, that a -mount can be accurately replicated (e.g. umounting and mounting again) -based on the information found in /proc/mounts. +The underlying reason for the above rules is to make sure, that a mount +can be accurately replicated (e.g. umounting and mounting again) based +on the information found in /proc/mounts. Resources ========= -- cgit From e04c83cd53b59e422157c4cea0cdc4e2f33fe305 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:08 +1000 Subject: docs: filesystems: vfs: Use uniform spacing around headings Currently spacing before and after headings is non-uniform. Use two blank lines before a heading and one after the heading. Use uniform spacing around headings. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 1cd0e658137a..242fd644c97b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -319,6 +319,7 @@ Whoever sets up the inode is responsible for filling in the "i_op" field. This is a pointer to a "struct inode_operations" which describes the methods that can be performed on individual inodes. + struct xattr_handlers --------------------- @@ -511,6 +512,7 @@ otherwise noted. tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to atomically creating, opening and unlinking a file in given directory. + The Address Space Object ======================== @@ -584,8 +586,10 @@ and the constraints under which it is being done. It is also used to return information back to the caller about the result of a writepage or writepages request. + Handling errors during writeback -------------------------------- + Most applications that do buffered I/O will periodically call a file synchronization call (fsync, fdatasync, msync or sync_file_range) to ensure that data written has made it to the backing store. When there @@ -616,6 +620,7 @@ file->fsync operation, they should call file_check_and_advance_wb_err to ensure that the struct file's error cursor has advanced to the correct point in the stream of errors emitted by the backing device(s). + struct address_space_operations ------------------------------- @@ -1207,9 +1212,11 @@ manipulate dentries: and the dentry is returned. The caller must use dput() to free the dentry when it finishes using it. + Mount Options ============= + Parsing options --------------- @@ -1224,6 +1231,7 @@ The header defines an API that helps parse these options. There are plenty of examples on how to use it in existing filesystems. + Showing options --------------- @@ -1245,6 +1253,7 @@ The underlying reason for the above rules is to make sure, that a mount can be accurately replicated (e.g. umounting and mounting again) based on the information found in /proc/mounts. + Resources ========= -- cgit From 90ac11a844f8859d5f960fb530190a9690a9a19b Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:09 +1000 Subject: docs: filesystems: vfs: Use correct initial heading Kernel RST has a preferred heading adornment scheme. Currently all the heading adornments follow this scheme except the document heading. Use correct heading adornment for initial heading. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 242fd644c97b..1167dd94d84b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -1,5 +1,6 @@ - - Overview of the Linux Virtual File System +========================================= +Overview of the Linux Virtual File System +========================================= Original author: Richard Gooch -- cgit From 099c5c7a3fba0c4686090075c6d214355aa67e47 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:10 +1000 Subject: docs: filesystems: vfs: Use SPDX identifier Currently the licence is indicated via a custom string. We have SPDX license identifiers now for this task. Use SPDX license identifier matching current license string. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 1167dd94d84b..bd6dd782e8ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ========================================= Overview of the Linux Virtual File System ========================================= @@ -7,8 +9,6 @@ Overview of the Linux Virtual File System Copyright (C) 1999 Richard Gooch Copyright (C) 2005 Pekka Enberg - This file is released under the GPLv2. - Introduction ============ -- cgit From e66b045715457ca6e18fce2b2fc61dd8af2e2440 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:11 +1000 Subject: docs: filesystems: vfs: Fix pre-amble indentation Currently file pre-amble contains custom indentation. RST is not going to like this, lets left-align the text. Put the copyright notices in a list in preparation for converting document to RST. Tested-by: Randy Dunlap Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index bd6dd782e8ca..9ed5c8d6e656 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -4,10 +4,10 @@ Overview of the Linux Virtual File System ========================================= - Original author: Richard Gooch +Original author: Richard Gooch - Copyright (C) 1999 Richard Gooch - Copyright (C) 2005 Pekka Enberg +- Copyright (C) 1999 Richard Gooch +- Copyright (C) 2005 Pekka Enberg Introduction -- cgit From 1b44ae63deae020e172866871bd14a76376e0f8b Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:12 +1000 Subject: docs: filesystems: vfs: Convert spaces to tabs There are bunch of places with 8 spaces, in preparation for correctly indenting all code snippets (during conversion to RST) change these to use tabspaces. This patch is whitespace only. Convert instances of 8 consecutive spaces to a single tabspace. Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.txt | 124 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 62 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 9ed5c8d6e656..4f4f4931bfa0 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -111,12 +111,12 @@ members are defined: struct file_system_type { const char *name; int fs_flags; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); - void (*kill_sb) (struct super_block *); - struct module *owner; - struct file_system_type * next; - struct list_head fs_supers; + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; }; @@ -205,26 +205,26 @@ This describes how the VFS can manipulate the superblock of your filesystem. As of kernel 2.6.22, the following members are defined: struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_fs) (struct super_block *); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*dirty_inode) (struct inode *, int flags); + int (*write_inode) (struct inode *, int); + void (*drop_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); + int (*statfs) (struct dentry *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, struct dentry *); + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); int (*nr_cached_objects)(struct super_block *); void (*free_cached_objects)(struct super_block *, int); }; @@ -479,7 +479,7 @@ otherwise noted. filesystem. May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or + mode, the filesystem must check the permission without blocking or storing to the inode. If a situation is encountered that rcu-walk cannot handle, return @@ -698,12 +698,12 @@ struct address_space_operations { tagged as DIRTY and will pass them to ->writepage. set_page_dirty: called by the VM to set a page dirty. - This is particularly needed if an address space attaches - private data to a page, and that data needs to be updated when - a page is dirtied. This is called, for example, when a memory + This is particularly needed if an address space attaches + private data to a page, and that data needs to be updated when + a page is dirtied. This is called, for example, when a memory mapped page gets modified. If defined, it should set the PageDirty flag, and the - PAGECACHE_TAG_DIRTY tag in the radix tree. + PAGECACHE_TAG_DIRTY tag in the radix tree. readpages: called by the VM to read pages associated with the address_space object. This is essentially just a vector version of @@ -721,7 +721,7 @@ struct address_space_operations { storage, then those blocks should be pre-read (if they haven't been read already) so that the updated blocks can be written out properly. - The filesystem must return the locked pagecache page for the specified + The filesystem must return the locked pagecache page for the specified offset, in *pagep, for the caller to write into. It must be able to cope with short writes (where the length passed to @@ -730,21 +730,21 @@ struct address_space_operations { flags is a field for AOP_FLAG_xxx flags, described in include/linux/fs.h. - A void * may be returned in fsdata, which then gets passed into - write_end. + A void * may be returned in fsdata, which then gets passed into + write_end. - Returns 0 on success; < 0 on failure (which is the error code), in + Returns 0 on success; < 0 on failure (which is the error code), in which case write_end is not called. write_end: After a successful write_begin, and data copy, write_end must - be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied. + be called. len is the original len passed to write_begin, and copied + is the amount that was able to be copied. - The filesystem must take care of unlocking the page and releasing it - refcount, and updating i_size. + The filesystem must take care of unlocking the page and releasing it + refcount, and updating i_size. - Returns < 0 on failure, otherwise the number of bytes (<= 'copied') - that were able to be copied into pagecache. + Returns < 0 on failure, otherwise the number of bytes (<= 'copied') + that were able to be copied into pagecache. bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP @@ -755,7 +755,7 @@ struct address_space_operations { are and uses those addresses directly. invalidatepage: If a page has PagePrivate set, then invalidatepage - will be called when part or all of the page is to be removed + will be called when part or all of the page is to be removed from the address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' @@ -767,47 +767,47 @@ struct address_space_operations { release MUST succeed. releasepage: releasepage is called on PagePrivate pages to indicate - that the page should be freed if possible. ->releasepage - should remove any private data from the page and clear the - PagePrivate flag. If releasepage() fails for some reason, it must + that the page should be freed if possible. ->releasepage + should remove any private data from the page and clear the + PagePrivate flag. If releasepage() fails for some reason, it must indicate failure with a 0 return value. releasepage() is used in two distinct though related cases. The first is when the VM finds a clean page with no active users and - wants to make it a free page. If ->releasepage succeeds, the - page will be removed from the address_space and become free. + wants to make it a free page. If ->releasepage succeeds, the + page will be removed from the address_space and become free. The second case is when a request has been made to invalidate - some or all pages in an address_space. This can happen - through the fadvise(POSIX_FADV_DONTNEED) system call or by the - filesystem explicitly requesting it as nfs and 9fs do (when - they believe the cache may be out of date with storage) by - calling invalidate_inode_pages2(). + some or all pages in an address_space. This can happen + through the fadvise(POSIX_FADV_DONTNEED) system call or by the + filesystem explicitly requesting it as nfs and 9fs do (when + they believe the cache may be out of date with storage) by + calling invalidate_inode_pages2(). If the filesystem makes such a call, and needs to be certain - that all pages are invalidated, then its releasepage will - need to ensure this. Possibly it can clear the PageUptodate - bit if it cannot free private data yet. + that all pages are invalidated, then its releasepage will + need to ensure this. Possibly it can clear the PageUptodate + bit if it cannot free private data yet. freepage: freepage is called once the page is no longer visible in - the page cache in order to allow the cleanup of any private + the page cache in order to allow the cleanup of any private data. Since it may be called by the memory reclaimer, it should not assume that the original address_space mapping still exists, and it should not block. direct_IO: called by the generic read/write routines to perform - direct_IO - that is IO requests which bypass the page cache - and transfer data directly between the storage and the - application's address space. + direct_IO - that is IO requests which bypass the page cache + and transfer data directly between the storage and the + application's address space. isolate_page: Called by the VM when isolating a movable non-lru page. If page is successfully isolated, VM marks the page as PG_isolated via __SetPageIsolated. migrate_page: This is used to compact the physical memory usage. - If the VM wants to relocate a page (maybe off a memory card - that is signalling imminent failure) it will pass a new page + If the VM wants to relocate a page (maybe off a memory card + that is signalling imminent failure) it will pass a new page and an old page to this function. migrate_page should transfer any private data across and update any references - that it has to the page. + that it has to the page. putback_page: Called by the VM when isolated page's migration fails. -- cgit From af96c1e304f7051bf2ee64c9957724bdace05c58 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 15 May 2019 10:29:13 +1000 Subject: docs: filesystems: vfs: Convert vfs.txt to RST vfs.txt is currently stale. If we convert it to RST this is a good first step in the process of getting the VFS documentation up to date. This patch does the following (all as a single patch so as not to introduce any new SPHINX build warnings) - Use '.. code-block:: c' for C code blocks and indent the code blocks. - Use double backticks for struct member descriptions. - Fix a couple of build warnings by guarding pointers (*) with double backticks .e.g ``*ptr``. - Add vfs to Documentation/filesystems/index.rst The member descriptions paragraph indentation was not touched. It is not pretty but these do not cause build warnings. These descriptions all need updating anyways so leave it as it is for now. Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/vfs.rst | 1291 +++++++++++++++++++++++++++++++++++ Documentation/filesystems/vfs.txt | 1274 ---------------------------------- 3 files changed, 1292 insertions(+), 1274 deletions(-) create mode 100644 Documentation/filesystems/vfs.rst delete mode 100644 Documentation/filesystems/vfs.txt (limited to 'Documentation') diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 1131c34d77f6..35644840a690 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -16,6 +16,7 @@ algorithms work. .. toctree:: :maxdepth: 2 + vfs path-lookup.rst api-summary splice diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst new file mode 100644 index 000000000000..2ffbdf5f392c --- /dev/null +++ b/Documentation/filesystems/vfs.rst @@ -0,0 +1,1291 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Overview of the Linux Virtual File System +========================================= + +Original author: Richard Gooch + +- Copyright (C) 1999 Richard Gooch +- Copyright (C) 2005 Pekka Enberg + + +Introduction +============ + +The Virtual File System (also known as the Virtual Filesystem Switch) is +the software layer in the kernel that provides the filesystem interface +to userspace programs. It also provides an abstraction within the +kernel which allows different filesystem implementations to coexist. + +VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on +are called from a process context. Filesystem locking is described in +the document Documentation/filesystems/Locking. + + +Directory Entry Cache (dcache) +------------------------------ + +The VFS implements the open(2), stat(2), chmod(2), and similar system +calls. The pathname argument that is passed to them is used by the VFS +to search through the directory entry cache (also known as the dentry +cache or dcache). This provides a very fast look-up mechanism to +translate a pathname (filename) into a specific dentry. Dentries live +in RAM and are never saved to disc: they exist only for performance. + +The dentry cache is meant to be a view into your entire filespace. As +most computers cannot fit all dentries in the RAM at the same time, some +bits of the cache are missing. In order to resolve your pathname into a +dentry, the VFS may have to resort to creating dentries along the way, +and then loading the inode. This is done by looking up the inode. + + +The Inode Object +---------------- + +An individual dentry usually has a pointer to an inode. Inodes are +filesystem objects such as regular files, directories, FIFOs and other +beasts. They live either on the disc (for block device filesystems) or +in the memory (for pseudo filesystems). Inodes that live on the disc +are copied into the memory when required and changes to the inode are +written back to disc. A single inode can be pointed to by multiple +dentries (hard links, for example, do this). + +To look up an inode requires that the VFS calls the lookup() method of +the parent directory inode. This method is installed by the specific +filesystem implementation that the inode lives in. Once the VFS has the +required dentry (and hence the inode), we can do all those boring things +like open(2) the file, or stat(2) it to peek at the inode data. The +stat(2) operation is fairly simple: once the VFS has the dentry, it +peeks at the inode data and passes some of it back to userspace. + + +The File Object +--------------- + +Opening a file requires another operation: allocation of a file +structure (this is the kernel-side implementation of file descriptors). +The freshly allocated file structure is initialized with a pointer to +the dentry and a set of file operation member functions. These are +taken from the inode data. The open() file method is then called so the +specific filesystem implementation can do its work. You can see that +this is another switch performed by the VFS. The file structure is +placed into the file descriptor table for the process. + +Reading, writing and closing files (and other assorted VFS operations) +is done by using the userspace file descriptor to grab the appropriate +file structure, and then calling the required file structure method to +do whatever is required. For as long as the file is open, it keeps the +dentry in use, which in turn means that the VFS inode is still in use. + + +Registering and Mounting a Filesystem +===================================== + +To register and unregister a filesystem, use the following API +functions: + +.. code-block:: c + + #include + + extern int register_filesystem(struct file_system_type *); + extern int unregister_filesystem(struct file_system_type *); + +The passed struct file_system_type describes your filesystem. When a +request is made to mount a filesystem onto a directory in your +namespace, the VFS will call the appropriate mount() method for the +specific filesystem. New vfsmount referring to the tree returned by +->mount() will be attached to the mountpoint, so that when pathname +resolution reaches the mountpoint it will jump into the root of that +vfsmount. + +You can see all filesystems that are registered to the kernel in the +file /proc/filesystems. + + +struct file_system_type +----------------------- + +This describes the filesystem. As of kernel 2.6.39, the following +members are defined: + +.. code-block:: c + + struct file_system_operations { + const char *name; + int fs_flags; + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; + struct lock_class_key s_lock_key; + struct lock_class_key s_umount_key; + }; + +``name``: the name of the filesystem type, such as "ext2", "iso9660", + "msdos" and so on + +``fs_flags``: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) + +``mount``: the method to call when a new instance of this filesystem should +be mounted + +``kill_sb``: the method to call when an instance of this filesystem + should be shut down + +``owner``: for internal VFS use: you should initialize this to THIS_MODULE in + most cases. + +``next``: for internal VFS use: you should initialize this to NULL + + s_lock_key, s_umount_key: lockdep-specific + +The mount() method has the following arguments: + +``struct file_system_type *fs_type``: describes the filesystem, partly initialized + by the specific filesystem code + +``int flags``: mount flags + +``const char *dev_name``: the device name we are mounting. + +``void *data``: arbitrary mount options, usually comes as an ASCII + string (see "Mount Options" section) + +The mount() method must return the root dentry of the tree requested by +caller. An active reference to its superblock must be grabbed and the +superblock must be locked. On failure it should return ERR_PTR(error). + +The arguments match those of mount(2) and their interpretation depends +on filesystem type. E.g. for block filesystems, dev_name is interpreted +as block device name, that device is opened and if it contains a +suitable filesystem image the method creates and initializes struct +super_block accordingly, returning its root dentry to caller. + +->mount() may choose to return a subtree of existing filesystem - it +doesn't have to create a new one. The main result from the caller's +point of view is a reference to dentry at the root of (sub)tree to be +attached; creation of new superblock is a common side effect. + +The most interesting member of the superblock structure that the mount() +method fills in is the "s_op" field. This is a pointer to a "struct +super_operations" which describes the next level of the filesystem +implementation. + +Usually, a filesystem uses one of the generic mount() implementations +and provides a fill_super() callback instead. The generic variants are: + +``mount_bdev``: mount a filesystem residing on a block device + +``mount_nodev``: mount a filesystem that is not backed by a device + +``mount_single``: mount a filesystem which shares the instance between + all mounts + +A fill_super() callback implementation has the following arguments: + +``struct super_block *sb``: the superblock structure. The callback + must initialize this properly. + +``void *data``: arbitrary mount options, usually comes as an ASCII + string (see "Mount Options" section) + +``int silent``: whether or not to be silent on error + + +The Superblock Object +===================== + +A superblock object represents a mounted filesystem. + + +struct super_operations +----------------------- + +This describes how the VFS can manipulate the superblock of your +filesystem. As of kernel 2.6.22, the following members are defined: + +.. code-block:: c + + struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*dirty_inode) (struct inode *, int flags); + int (*write_inode) (struct inode *, int); + void (*drop_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); + int (*statfs) (struct dentry *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, struct dentry *); + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); + }; + +All methods are called without any locks being held, unless otherwise +noted. This means that most methods can block safely. All methods are +only called from a process context (i.e. not from an interrupt handler +or bottom half). + +``alloc_inode``: this method is called by alloc_inode() to allocate memory + for struct inode and initialize it. If this function is not + defined, a simple 'struct inode' is allocated. Normally + alloc_inode will be used to allocate a larger structure which + contains a 'struct inode' embedded within it. + +``destroy_inode``: this method is called by destroy_inode() to release + resources allocated for struct inode. It is only required if + ->alloc_inode was defined and simply undoes anything done by + ->alloc_inode. + +``dirty_inode``: this method is called by the VFS to mark an inode dirty. + +``write_inode``: this method is called when the VFS needs to write an + inode to disc. The second parameter indicates whether the write + should be synchronous or not, not all filesystems check this flag. + +``drop_inode``: called when the last access to the inode is dropped, + with the inode->i_lock spinlock held. + + This method should be either NULL (normal UNIX filesystem + semantics) or "generic_delete_inode" (for filesystems that do not + want to cache inodes - causing "delete_inode" to always be + called regardless of the value of i_nlink) + + The "generic_delete_inode()" behavior is equivalent to the + old practice of using "force_delete" in the put_inode() case, + but does not have the races that the "force_delete()" approach + had. + +``delete_inode``: called when the VFS wants to delete an inode + +``put_super``: called when the VFS wishes to free the superblock + (i.e. unmount). This is called with the superblock lock held + +``sync_fs``: called when VFS is writing out all dirty data associated with + a superblock. The second parameter indicates whether the method + should wait until the write out has been completed. Optional. + +``freeze_fs``: called when VFS is locking a filesystem and + forcing it into a consistent state. This method is currently + used by the Logical Volume Manager (LVM). + +``unfreeze_fs``: called when VFS is unlocking a filesystem and making it writable + again. + +``statfs``: called when the VFS needs to get filesystem statistics. + +``remount_fs``: called when the filesystem is remounted. This is called + with the kernel lock held + +``clear_inode``: called then the VFS clears the inode. Optional + +``umount_begin``: called when the VFS is unmounting a filesystem. + +``show_options``: called by the VFS to show mount options for + /proc//mounts. (see "Mount Options" section) + +``quota_read``: called by the VFS to read from filesystem quota file. + +``quota_write``: called by the VFS to write to filesystem quota file. + +``nr_cached_objects``: called by the sb cache shrinking function for the + filesystem to return the number of freeable cached objects it contains. + Optional. + +``free_cache_objects``: called by the sb cache shrinking function for the + filesystem to scan the number of objects indicated to try to free them. + Optional, but any filesystem implementing this method needs to also + implement ->nr_cached_objects for it to be called correctly. + + We can't do anything with any errors that the filesystem might + encountered, hence the void return type. This will never be called if + the VM is trying to reclaim under GFP_NOFS conditions, hence this + method does not need to handle that situation itself. + + Implementations must include conditional reschedule calls inside any + scanning loop that is done. This allows the VFS to determine + appropriate scan batch sizes without having to worry about whether + implementations will cause holdoff problems due to large scan batch + sizes. + +Whoever sets up the inode is responsible for filling in the "i_op" +field. This is a pointer to a "struct inode_operations" which describes +the methods that can be performed on individual inodes. + + +struct xattr_handlers +--------------------- + +On filesystems that support extended attributes (xattrs), the s_xattr +superblock field points to a NULL-terminated array of xattr handlers. +Extended attributes are name:value pairs. + +``name``: Indicates that the handler matches attributes with the specified name + (such as "system.posix_acl_access"); the prefix field must be NULL. + +``prefix``: Indicates that the handler matches all attributes with the specified + name prefix (such as "user."); the name field must be NULL. + +``list``: Determine if attributes matching this xattr handler should be listed + for a particular dentry. Used by some listxattr implementations like + generic_listxattr. + +``get``: Called by the VFS to get the value of a particular extended attribute. + This method is called by the getxattr(2) system call. + +``set``: Called by the VFS to set the value of a particular extended attribute. + When the new value is NULL, called to remove a particular extended + attribute. This method is called by the the setxattr(2) and + removexattr(2) system calls. + +When none of the xattr handlers of a filesystem match the specified +attribute name or when a filesystem doesn't support extended attributes, +the various ``*xattr(2)`` system calls return -EOPNOTSUPP. + + +The Inode Object +================ + +An inode object represents an object within the filesystem. + + +struct inode_operations +----------------------- + +This describes how the VFS can manipulate an inode in your filesystem. +As of kernel 2.6.22, the following members are defined: + +.. code-block:: c + + struct inode_operations { + int (*create) (struct inode *,struct dentry *, umode_t, bool); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,umode_t); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); + int (*readlink) (struct dentry *, char __user *,int); + const char *(*get_link) (struct dentry *, struct inode *, + struct delayed_call *); + int (*permission) (struct inode *, int); + int (*get_acl)(struct inode *, int); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + void (*update_time)(struct inode *, struct timespec *, int); + int (*atomic_open)(struct inode *, struct dentry *, struct file *, + unsigned open_flag, umode_t create_mode); + int (*tmpfile) (struct inode *, struct dentry *, umode_t); + }; + +Again, all methods are called without any locks being held, unless +otherwise noted. + +``create``: called by the open(2) and creat(2) system calls. Only + required if you want to support regular files. The dentry you + get should not have an inode (i.e. it should be a negative + dentry). Here you will probably call d_instantiate() with the + dentry and the newly created inode + +``lookup``: called when the VFS needs to look up an inode in a parent + directory. The name to look for is found in the dentry. This + method must call d_add() to insert the found inode into the + dentry. The "i_count" field in the inode structure should be + incremented. If the named inode does not exist a NULL inode + should be inserted into the dentry (this is called a negative + dentry). Returning an error code from this routine must only + be done on a real error, otherwise creating inodes with system + calls like create(2), mknod(2), mkdir(2) and so on will fail. + If you wish to overload the dentry methods then you should + initialise the "d_dop" field in the dentry; this is a pointer + to a struct "dentry_operations". + This method is called with the directory inode semaphore held + +``link``: called by the link(2) system call. Only required if you want + to support hard links. You will probably need to call + d_instantiate() just as you would in the create() method + +``unlink``: called by the unlink(2) system call. Only required if you + want to support deleting inodes + +``symlink``: called by the symlink(2) system call. Only required if you + want to support symlinks. You will probably need to call + d_instantiate() just as you would in the create() method + +``mkdir``: called by the mkdir(2) system call. Only required if you want + to support creating subdirectories. You will probably need to + call d_instantiate() just as you would in the create() method + +``rmdir``: called by the rmdir(2) system call. Only required if you want + to support deleting subdirectories + +``mknod``: called by the mknod(2) system call to create a device (char, + block) inode or a named pipe (FIFO) or socket. Only required + if you want to support creating these types of inodes. You + will probably need to call d_instantiate() just as you would + in the create() method + +``rename``: called by the rename(2) system call to rename the object to + have the parent and name given by the second inode and dentry. + + The filesystem must return -EINVAL for any unsupported or + unknown flags. Currently the following flags are implemented: + (1) RENAME_NOREPLACE: this flag indicates that if the target + of the rename exists the rename should fail with -EEXIST + instead of replacing the target. The VFS already checks for + existence, so for local filesystems the RENAME_NOREPLACE + implementation is equivalent to plain rename. + (2) RENAME_EXCHANGE: exchange source and target. Both must + exist; this is checked by the VFS. Unlike plain rename, + source and target may be of different type. + +``get_link``: called by the VFS to follow a symbolic link to the + inode it points to. Only required if you want to support + symbolic links. This method returns the symlink body + to traverse (and possibly resets the current position with + nd_jump_link()). If the body won't go away until the inode + is gone, nothing else is needed; if it needs to be otherwise + pinned, arrange for its release by having get_link(..., ..., done) + do set_delayed_call(done, destructor, argument). + In that case destructor(argument) will be called once VFS is + done with the body you've returned. + May be called in RCU mode; that is indicated by NULL dentry + argument. If request can't be handled without leaving RCU mode, + have it return ERR_PTR(-ECHILD). + + + If the filesystem stores the symlink target in ->i_link, the + VFS may use it directly without calling ->get_link(); however, + ->get_link() must still be provided. ->i_link must not be + freed until after an RCU grace period. Writing to ->i_link + post-iget() time requires a 'release' memory barrier. + +``readlink``: this is now just an override for use by readlink(2) for the + cases when ->get_link uses nd_jump_link() or object is not in + fact a symlink. Normally filesystems should only implement + ->get_link for symlinks and readlink(2) will automatically use + that. + +``permission``: called by the VFS to check for access rights on a POSIX-like + filesystem. + + May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk + mode, the filesystem must check the permission without blocking or + storing to the inode. + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + +``setattr``: called by the VFS to set attributes for a file. This method + is called by chmod(2) and related system calls. + +``getattr``: called by the VFS to get attributes of a file. This method + is called by stat(2) and related system calls. + +``listxattr``: called by the VFS to list all extended attributes for a + given file. This method is called by the listxattr(2) system call. + +``update_time``: called by the VFS to update a specific time or the i_version of + an inode. If this is not defined the VFS will update the inode itself + and call mark_inode_dirty_sync. + +``atomic_open``: called on the last component of an open. Using this optional + method the filesystem can look up, possibly create and open the file in + one atomic operation. If it wants to leave actual opening to the + caller (e.g. if the file turned out to be a symlink, device, or just + something filesystem won't do atomic open for), it may signal this by + returning finish_no_open(file, dentry). This method is only called if + the last component is negative or needs lookup. Cached positive dentries + are still handled by f_op->open(). If the file was created, + FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL + the method must only succeed if the file didn't exist and hence FMODE_CREATED + shall always be set on success. + +``tmpfile``: called in the end of O_TMPFILE open(). Optional, equivalent to + atomically creating, opening and unlinking a file in given directory. + + +The Address Space Object +======================== + +The address space object is used to group and manage pages in the page +cache. It can be used to keep track of the pages in a file (or anything +else) and also track the mapping of sections of the file into process +address spaces. + +There are a number of distinct yet related services that an +address-space can provide. These include communicating memory pressure, +page lookup by address, and keeping track of pages tagged as Dirty or +Writeback. + +The first can be used independently to the others. The VM can try to +either write dirty pages in order to clean them, or release clean pages +in order to reuse them. To do this it can call the ->writepage method +on dirty pages, and ->releasepage on clean pages with PagePrivate set. +Clean pages without PagePrivate and with no external references will be +released without notice being given to the address_space. + +To achieve this functionality, pages need to be placed on an LRU with +lru_cache_add and mark_page_active needs to be called whenever the page +is used. + +Pages are normally kept in a radix tree index by ->index. This tree +maintains information about the PG_Dirty and PG_Writeback status of each +page, so that pages with either of these flags can be found quickly. + +The Dirty tag is primarily used by mpage_writepages - the default +->writepages method. It uses the tag to find dirty pages to call +->writepage on. If mpage_writepages is not used (i.e. the address +provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost +unused. write_inode_now and sync_inode do use it (through +__sync_single_inode) to check if ->writepages has been successful in +writing out the whole address_space. + +The Writeback tag is used by filemap*wait* and sync_page* functions, via +filemap_fdatawait_range, to wait for all writeback to complete. + +An address_space handler may attach extra information to a page, +typically using the 'private' field in the 'struct page'. If such +information is attached, the PG_Private flag should be set. This will +cause various VM routines to make extra calls into the address_space +handler to deal with that data. + +An address space acts as an intermediate between storage and +application. Data is read into the address space a whole page at a +time, and provided to the application either by copying of the page, or +by memory-mapping the page. Data is written into the address space by +the application, and then written-back to storage typically in whole +pages, however the address_space has finer control of write sizes. + +The read process essentially only requires 'readpage'. The write +process is more complicated and uses write_begin/write_end or +set_page_dirty to write data into the address_space, and writepage and +writepages to writeback data to storage. + +Adding and removing pages to/from an address_space is protected by the +inode's i_mutex. + +When data is written to a page, the PG_Dirty flag should be set. It +typically remains set until writepage asks for it to be written. This +should clear PG_Dirty and set PG_Writeback. It can be actually written +at any point after PG_Dirty is clear. Once it is known to be safe, +PG_Writeback is cleared. + +Writeback makes use of a writeback_control structure to direct the +operations. This gives the the writepage and writepages operations some +information about the nature of and reason for the writeback request, +and the constraints under which it is being done. It is also used to +return information back to the caller about the result of a writepage or +writepages request. + + +Handling errors during writeback +-------------------------------- + +Most applications that do buffered I/O will periodically call a file +synchronization call (fsync, fdatasync, msync or sync_file_range) to +ensure that data written has made it to the backing store. When there +is an error during writeback, they expect that error to be reported when +a file sync request is made. After an error has been reported on one +request, subsequent requests on the same file descriptor should return +0, unless further writeback errors have occurred since the previous file +syncronization. + +Ideally, the kernel would report errors only on file descriptions on +which writes were done that subsequently failed to be written back. The +generic pagecache infrastructure does not track the file descriptions +that have dirtied each individual page however, so determining which +file descriptors should get back an error is not possible. + +Instead, the generic writeback error tracking infrastructure in the +kernel settles for reporting errors to fsync on all file descriptions +that were open at the time that the error occurred. In a situation with +multiple writers, all of them will get back an error on a subsequent +fsync, even if all of the writes done through that particular file +descriptor succeeded (or even if there were no writes on that file +descriptor at all). + +Filesystems that wish to use this infrastructure should call +mapping_set_error to record the error in the address_space when it +occurs. Then, after writing back data from the pagecache in their +file->fsync operation, they should call file_check_and_advance_wb_err to +ensure that the struct file's error cursor has advanced to the correct +point in the stream of errors emitted by the backing device(s). + + +struct address_space_operations +------------------------------- + +This describes how the VFS can manipulate mapping of a file to page +cache in your filesystem. The following members are defined: + +.. code-block:: c + + struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + int (*writepages)(struct address_space *, struct writeback_control *); + int (*set_page_dirty)(struct page *page); + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); + int (*releasepage) (struct page *, int); + void (*freepage)(struct page *); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* isolate a page for migration */ + bool (*isolate_page) (struct page *, isolate_mode_t); + /* migrate the contents of a page to the specified target */ + int (*migratepage) (struct page *, struct page *); + /* put migration-failed page back to right list */ + void (*putback_page) (struct page *); + int (*launder_page) (struct page *); + + int (*is_partially_uptodate) (struct page *, unsigned long, + unsigned long); + void (*is_dirty_writeback) (struct page *, bool *, bool *); + int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*swap_activate)(struct file *); + int (*swap_deactivate)(struct file *); + }; + +``writepage``: called by the VM to write a dirty page to backing store. + This may happen for data integrity reasons (i.e. 'sync'), or + to free up memory (flush). The difference can be seen in + wbc->sync_mode. + The PG_Dirty flag has been cleared and PageLocked is true. + writepage should start writeout, should set PG_Writeback, + and should make sure the page is unlocked, either synchronously + or asynchronously when the write operation completes. + + If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to + try too hard if there are problems, and may choose to write out + other pages from the mapping if that is easier (e.g. due to + internal dependencies). If it chooses not to start writeout, it + should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep + calling ->writepage on that page. + + See the file "Locking" for more details. + +``readpage``: called by the VM to read a page from backing store. + The page will be Locked when readpage is called, and should be + unlocked and marked uptodate once the read completes. + If ->readpage discovers that it needs to unlock the page for + some reason, it can do so, and then return AOP_TRUNCATED_PAGE. + In this case, the page will be relocated, relocked and if + that all succeeds, ->readpage will be called again. + +``writepages``: called by the VM to write out pages associated with the + address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then + the writeback_control will specify a range of pages that must be + written out. If it is WBC_SYNC_NONE, then a nr_to_write is given + and that many pages should be written if possible. + If no ->writepages is given, then mpage_writepages is used + instead. This will choose pages from the address space that are + tagged as DIRTY and will pass them to ->writepage. + +``set_page_dirty``: called by the VM to set a page dirty. + This is particularly needed if an address space attaches + private data to a page, and that data needs to be updated when + a page is dirtied. This is called, for example, when a memory + mapped page gets modified. + If defined, it should set the PageDirty flag, and the + PAGECACHE_TAG_DIRTY tag in the radix tree. + +``readpages``: called by the VM to read pages associated with the address_space + object. This is essentially just a vector version of + readpage. Instead of just one page, several pages are + requested. + readpages is only used for read-ahead, so read errors are + ignored. If anything goes wrong, feel free to give up. + +``write_begin``: + Called by the generic buffered write code to ask the filesystem to + prepare to write len bytes at the given offset in the file. The + address_space should check that the write will be able to complete, + by allocating space if necessary and doing any other internal + housekeeping. If the write will update parts of any basic-blocks on + storage, then those blocks should be pre-read (if they haven't been + read already) so that the updated blocks can be written out properly. + + The filesystem must return the locked pagecache page for the specified + offset, in ``*pagep``, for the caller to write into. + + It must be able to cope with short writes (where the length passed to + write_begin is greater than the number of bytes copied into the page). + + flags is a field for AOP_FLAG_xxx flags, described in + include/linux/fs.h. + + A void * may be returned in fsdata, which then gets passed into + write_end. + + Returns 0 on success; < 0 on failure (which is the error code), in + which case write_end is not called. + +``write_end``: After a successful write_begin, and data copy, write_end must + be called. len is the original len passed to write_begin, and copied + is the amount that was able to be copied. + + The filesystem must take care of unlocking the page and releasing it + refcount, and updating i_size. + + Returns < 0 on failure, otherwise the number of bytes (<= 'copied') + that were able to be copied into pagecache. + +``bmap``: called by the VFS to map a logical block offset within object to + physical block number. This method is used by the FIBMAP + ioctl and for working with swap-files. To be able to swap to + a file, the file must have a stable mapping to a block + device. The swap system does not go through the filesystem + but instead uses bmap to find out where the blocks in the file + are and uses those addresses directly. + +``invalidatepage``: If a page has PagePrivate set, then invalidatepage + will be called when part or all of the page is to be removed + from the address space. This generally corresponds to either a + truncation, punch hole or a complete invalidation of the address + space (in the latter case 'offset' will always be 0 and 'length' + will be PAGE_SIZE). Any private data associated with the page + should be updated to reflect this truncation. If offset is 0 and + length is PAGE_SIZE, then the private data should be released, + because the page must be able to be completely discarded. This may + be done by calling the ->releasepage function, but in this case the + release MUST succeed. + +``releasepage``: releasepage is called on PagePrivate pages to indicate + that the page should be freed if possible. ->releasepage + should remove any private data from the page and clear the + PagePrivate flag. If releasepage() fails for some reason, it must + indicate failure with a 0 return value. + releasepage() is used in two distinct though related cases. The + first is when the VM finds a clean page with no active users and + wants to make it a free page. If ->releasepage succeeds, the + page will be removed from the address_space and become free. + + The second case is when a request has been made to invalidate + some or all pages in an address_space. This can happen + through the fadvise(POSIX_FADV_DONTNEED) system call or by the + filesystem explicitly requesting it as nfs and 9fs do (when + they believe the cache may be out of date with storage) by + calling invalidate_inode_pages2(). + If the filesystem makes such a call, and needs to be certain + that all pages are invalidated, then its releasepage will + need to ensure this. Possibly it can clear the PageUptodate + bit if it cannot free private data yet. + +``freepage``: freepage is called once the page is no longer visible in + the page cache in order to allow the cleanup of any private + data. Since it may be called by the memory reclaimer, it + should not assume that the original address_space mapping still + exists, and it should not block. + +``direct_IO``: called by the generic read/write routines to perform + direct_IO - that is IO requests which bypass the page cache + and transfer data directly between the storage and the + application's address space. + +``isolate_page``: Called by the VM when isolating a movable non-lru page. + If page is successfully isolated, VM marks the page as PG_isolated + via __SetPageIsolated. + +``migrate_page``: This is used to compact the physical memory usage. + If the VM wants to relocate a page (maybe off a memory card + that is signalling imminent failure) it will pass a new page + and an old page to this function. migrate_page should + transfer any private data across and update any references + that it has to the page. + +``putback_page``: Called by the VM when isolated page's migration fails. + +``launder_page``: Called before freeing a page - it writes back the dirty page. To + prevent redirtying the page, it is kept locked during the whole + operation. + +``is_partially_uptodate``: Called by the VM when reading a file through the + pagecache when the underlying blocksize != pagesize. If the required + block is up to date then the read can complete without needing the IO + to bring the whole page up to date. + +``is_dirty_writeback``: Called by the VM when attempting to reclaim a page. + The VM uses dirty and writeback information to determine if it needs + to stall to allow flushers a chance to complete some IO. Ordinarily + it can use PageDirty and PageWriteback but some filesystems have + more complex state (unstable pages in NFS prevent reclaim) or + do not set those flags due to locking problems. This callback + allows a filesystem to indicate to the VM if a page should be + treated as dirty or writeback for the purposes of stalling. + +``error_remove_page``: normally set to generic_error_remove_page if truncation + is ok for this address space. Used for memory failure handling. + Setting this implies you deal with pages going away under you, + unless you have them locked or reference counts increased. + +``swap_activate``: Called when swapon is used on a file to allocate + space if necessary and pin the block lookup information in + memory. A return value of zero indicates success, + in which case this file can be used to back swapspace. + +``swap_deactivate``: Called during swapoff on files where swap_activate + was successful. + + +The File Object +=============== + +A file object represents a file opened by a process. This is also known +as an "open file description" in POSIX parlance. + + +struct file_operations +---------------------- + +This describes how the VFS can manipulate an open file. As of kernel +4.18, the following members are defined: + +.. code-block:: c + + struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); + int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iterate) (struct file *, struct dir_context *); + int (*iterate_shared) (struct file *, struct dir_context *); + __poll_t (*poll) (struct file *, struct poll_table_struct *); + long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); + long (*compat_ioctl) (struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *, fl_owner_t id); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*check_flags)(int); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **, void **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); + void (*show_fdinfo)(struct seq_file *m, struct file *f); + #ifndef CONFIG_MMU + unsigned (*mmap_capabilities)(struct file *); + #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); + int (*fadvise)(struct file *, loff_t, loff_t, int); + }; + +Again, all methods are called without any locks being held, unless +otherwise noted. + +``llseek``: called when the VFS needs to move the file position index + +``read``: called by read(2) and related system calls + +``read_iter``: possibly asynchronous read with iov_iter as destination + +``write``: called by write(2) and related system calls + +``write_iter``: possibly asynchronous write with iov_iter as source + +``iopoll``: called when aio wants to poll for completions on HIPRI iocbs + +``iterate``: called when the VFS needs to read the directory contents + +``iterate_shared``: called when the VFS needs to read the directory contents + when filesystem supports concurrent dir iterators + +``poll``: called by the VFS when a process wants to check if there is + activity on this file and (optionally) go to sleep until there + is activity. Called by the select(2) and poll(2) system calls + +``unlocked_ioctl``: called by the ioctl(2) system call. + +``compat_ioctl``: called by the ioctl(2) system call when 32 bit system calls + are used on 64 bit kernels. + +``mmap``: called by the mmap(2) system call + +``open``: called by the VFS when an inode should be opened. When the VFS + opens a file, it creates a new "struct file". It then calls the + open method for the newly allocated file structure. You might + think that the open method really belongs in + "struct inode_operations", and you may be right. I think it's + done the way it is because it makes filesystems simpler to + implement. The open() method is a good place to initialize the + "private_data" member in the file structure if you want to point + to a device structure + +``flush``: called by the close(2) system call to flush a file + +``release``: called when the last reference to an open file is closed + +``fsync``: called by the fsync(2) system call. Also see the section above + entitled "Handling errors during writeback". + +``fasync``: called by the fcntl(2) system call when asynchronous + (non-blocking) mode is enabled for a file + +``lock``: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW + commands + +``get_unmapped_area``: called by the mmap(2) system call + +``check_flags``: called by the fcntl(2) system call for F_SETFL command + +``flock``: called by the flock(2) system call + +``splice_write``: called by the VFS to splice data from a pipe to a file. This + method is used by the splice(2) system call + +``splice_read``: called by the VFS to splice data from file to a pipe. This + method is used by the splice(2) system call + +``setlease``: called by the VFS to set or release a file lock lease. setlease + implementations should call generic_setlease to record or remove + the lease in the inode after setting it. + +``fallocate``: called by the VFS to preallocate blocks or punch a hole. + +``copy_file_range``: called by the copy_file_range(2) system call. + +``remap_file_range``: called by the ioctl(2) system call for FICLONERANGE and + FICLONE and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source file into + the dest file at pos_out. Implementations must handle callers passing + in len == 0; this means "remap to the end of the source file". The + return value should the number of bytes remapped, or the usual + negative error code if errors occurred before any bytes were remapped. + The remap_flags parameter accepts REMAP_FILE_* flags. If + REMAP_FILE_DEDUP is set then the implementation must only remap if the + requested file ranges have identical contents. If REMAP_CAN_SHORTEN is + set, the caller is ok with the implementation shortening the request + length to satisfy alignment or EOF requirements (or any other reason). + +``fadvise``: possibly called by the fadvise64() system call. + +Note that the file operations are implemented by the specific +filesystem in which the inode resides. When opening a device node +(character or block special) most filesystems will call special +support routines in the VFS which will locate the required device +driver information. These support routines replace the filesystem file +operations with those for the device driver, and then proceed to call +the new open() method for the file. This is how opening a device file +in the filesystem eventually ends up calling the device driver open() +method. + + +Directory Entry Cache (dcache) +============================== + + +struct dentry_operations +------------------------ + +This describes how a filesystem can overload the standard dentry +operations. Dentries and the dcache are the domain of the VFS and the +individual filesystem implementations. Device drivers have no business +here. These methods may be set to NULL, as they are either optional or +the VFS uses a default. As of kernel 2.6.22, the following members are +defined: + +.. code-block:: c + + struct dentry_operations { + int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_weak_revalidate)(struct dentry *, unsigned int); + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, + unsigned int, const char *, const struct qstr *); + int (*d_delete)(const struct dentry *); + int (*d_init)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); + char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(const struct path *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *); + }; + +``d_revalidate``: called when the VFS needs to revalidate a dentry. This + is called whenever a name look-up finds a dentry in the + dcache. Most local filesystems leave this as NULL, because all their + dentries in the dcache are valid. Network filesystems are different + since things can change on the server without the client necessarily + being aware of it. + + This function should return a positive value if the dentry is still + valid, and zero or a negative error code if it isn't. + + d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). + If in rcu-walk mode, the filesystem must revalidate the dentry without + blocking or storing to the dentry, d_parent and d_inode should not be + used without care (because they can change and, in d_inode case, even + become NULL under us). + + If a situation is encountered that rcu-walk cannot handle, return + -ECHILD and it will be called again in ref-walk mode. + +``_weak_revalidate``: called when the VFS needs to revalidate a "jumped" dentry. + This is called when a path-walk ends at dentry that was not acquired by + doing a lookup in the parent directory. This includes "/", "." and "..", + as well as procfs-style symlinks and mountpoint traversal. + + In this case, we are less concerned with whether the dentry is still + fully correct, but rather that the inode is still valid. As with + d_revalidate, most local filesystems will set this to NULL since their + dcache entries are always valid. + + This function has the same return code semantics as d_revalidate. + + d_weak_revalidate is only called after leaving rcu-walk mode. + +``d_hash``: called when the VFS adds a dentry to the hash table. The first + dentry passed to d_hash is the parent directory that the name is + to be hashed into. + + Same locking and synchronisation rules as d_compare regarding + what is safe to dereference etc. + +``d_compare``: called to compare a dentry name with a given name. The first + dentry is the parent of the dentry to be compared, the second is + the child dentry. len and name string are properties of the dentry + to be compared. qstr is the name to compare it with. + + Must be constant and idempotent, and should not take locks if + possible, and should not or store into the dentry. + Should not dereference pointers outside the dentry without + lots of care (eg. d_parent, d_inode, d_name should not be used). + + However, our vfsmount is pinned, and RCU held, so the dentries and + inodes won't disappear, neither will our sb or filesystem module. + ->d_sb may be used. + + It is a tricky calling convention because it needs to be called under + "rcu-walk", ie. without any locks or references on things. + +``d_delete``: called when the last reference to a dentry is dropped and the + dcache is deciding whether or not to cache it. Return 1 to delete + immediately, or 0 to cache the dentry. Default is NULL which means to + always cache a reachable dentry. d_delete must be constant and + idempotent. + +``d_init``: called when a dentry is allocated + +``d_release``: called when a dentry is really deallocated + +``d_iput``: called when a dentry loses its inode (just prior to its + being deallocated). The default when this is NULL is that the + VFS calls iput(). If you define this method, you must call + iput() yourself + +``d_dname``: called when the pathname of a dentry should be generated. + Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay + pathname generation. (Instead of doing it when dentry is created, + it's done only when the path is needed.). Real filesystems probably + dont want to use it, because their dentries are present in global + dcache hash, so their hash should be an invariant. As no lock is + held, d_dname() should not try to modify the dentry itself, unless + appropriate SMP safety is used. CAUTION : d_path() logic is quite + tricky. The correct way to return for example "Hello" is to put it + at the end of the buffer, and returns a pointer to the first char. + dynamic_dname() helper function is provided to take care of this. + + Example : + +.. code-block:: c + + static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) + { + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); + } + +``d_automount``: called when an automount dentry is to be traversed (optional). + This should create a new VFS mount record and return the record to the + caller. The caller is supplied with a path parameter giving the + automount directory to describe the automount target and the parent + VFS mount record to provide inheritable mount parameters. NULL should + be returned if someone else managed to make the automount first. If + the vfsmount creation failed, then an error code should be returned. + If -EISDIR is returned, then the directory will be treated as an + ordinary directory and returned to pathwalk to continue walking. + + If a vfsmount is returned, the caller will attempt to mount it on the + mountpoint and will remove the vfsmount from its expiration list in + the case of failure. The vfsmount should be returned with 2 refs on + it to prevent automatic expiration - the caller will clean up the + additional ref. + + This function is only used if DCACHE_NEED_AUTOMOUNT is set on the + dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the + inode being added. + +``d_manage``: called to allow the filesystem to manage the transition from a + dentry (optional). This allows autofs, for example, to hold up clients + waiting to explore behind a 'mountpoint' while letting the daemon go + past and construct the subtree there. 0 should be returned to let the + calling process continue. -EISDIR can be returned to tell pathwalk to + use this directory as an ordinary directory and to ignore anything + mounted on it and not to check the automount flag. Any other error + code will abort pathwalk completely. + + If the 'rcu_walk' parameter is true, then the caller is doing a + pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, + and the caller can be asked to leave it and call again by returning + -ECHILD. -EISDIR may also be returned to tell pathwalk to + ignore d_automount or any mounts. + + This function is only used if DCACHE_MANAGE_TRANSIT is set on the + dentry being transited from. + +``d_real``: overlay/union type filesystems implement this method to return one of + the underlying dentries hidden by the overlay. It is used in two + different modes: + + Called from file_dentry() it returns the real dentry matching the inode + argument. The real dentry may be from a lower layer already copied up, + but still referenced from the file. This mode is selected with a + non-NULL inode argument. + + With NULL inode the topmost real underlying dentry is returned. + +Each dentry has a pointer to its parent dentry, as well as a hash list +of child dentries. Child dentries are basically like files in a +directory. + + +Directory Entry Cache API +-------------------------- + +There are a number of functions defined which permit a filesystem to +manipulate dentries: + +``dget``: open a new handle for an existing dentry (this just increments + the usage count) + +``dput``: close a handle for a dentry (decrements the usage count). If + the usage count drops to 0, and the dentry is still in its + parent's hash, the "d_delete" method is called to check whether + it should be cached. If it should not be cached, or if the dentry + is not hashed, it is deleted. Otherwise cached dentries are put + into an LRU list to be reclaimed on memory shortage. + +``d_drop``: this unhashes a dentry from its parents hash list. A + subsequent call to dput() will deallocate the dentry if its + usage count drops to 0 + +``d_delete``: delete a dentry. If there are no other open references to + the dentry then the dentry is turned into a negative dentry + (the d_iput() method is called). If there are other + references, then d_drop() is called instead + +``d_add``: add a dentry to its parents hash list and then calls + d_instantiate() + +``d_instantiate``: add a dentry to the alias hash list for the inode and + updates the "d_inode" member. The "i_count" member in the + inode structure should be set/incremented. If the inode + pointer is NULL, the dentry is called a "negative + dentry". This function is commonly called when an inode is + created for an existing negative dentry + +``d_lookup``: look up a dentry given its parent and path name component + It looks up the child of that given name from the dcache + hash table. If it is found, the reference count is incremented + and the dentry is returned. The caller must use dput() + to free the dentry when it finishes using it. + + +Mount Options +============= + + +Parsing options +--------------- + +On mount and remount the filesystem is passed a string containing a +comma separated list of mount options. The options can have either of +these forms: + + option + option=value + +The header defines an API that helps parse these +options. There are plenty of examples on how to use it in existing +filesystems. + + +Showing options +--------------- + +If a filesystem accepts mount options, it must define show_options() to +show all the currently active options. The rules are: + + - options MUST be shown which are not default or their values differ + from the default + + - options MAY be shown which are enabled by default or have their + default value + +Options used only internally between a mount helper and the kernel (such +as file descriptors), or which only have an effect during the mounting +(such as ones controlling the creation of a journal) are exempt from the +above rules. + +The underlying reason for the above rules is to make sure, that a mount +can be accurately replicated (e.g. umounting and mounting again) based +on the information found in /proc/mounts. + + +Resources +========= + +(Note some of these resources are not up-to-date with the latest kernel + version.) + +Creating Linux virtual filesystems. 2002 + + +The Linux Virtual File-system Layer by Neil Brown. 1999 + + +A tour of the Linux VFS by Michael K. Johnson. 1996 + + +A small trail through the Linux kernel by Andries Brouwer. 2001 + diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt deleted file mode 100644 index 4f4f4931bfa0..000000000000 --- a/Documentation/filesystems/vfs.txt +++ /dev/null @@ -1,1274 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================================= -Overview of the Linux Virtual File System -========================================= - -Original author: Richard Gooch - -- Copyright (C) 1999 Richard Gooch -- Copyright (C) 2005 Pekka Enberg - - -Introduction -============ - -The Virtual File System (also known as the Virtual Filesystem Switch) is -the software layer in the kernel that provides the filesystem interface -to userspace programs. It also provides an abstraction within the -kernel which allows different filesystem implementations to coexist. - -VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on -are called from a process context. Filesystem locking is described in -the document Documentation/filesystems/Locking. - - -Directory Entry Cache (dcache) ------------------------------- - -The VFS implements the open(2), stat(2), chmod(2), and similar system -calls. The pathname argument that is passed to them is used by the VFS -to search through the directory entry cache (also known as the dentry -cache or dcache). This provides a very fast look-up mechanism to -translate a pathname (filename) into a specific dentry. Dentries live -in RAM and are never saved to disc: they exist only for performance. - -The dentry cache is meant to be a view into your entire filespace. As -most computers cannot fit all dentries in the RAM at the same time, some -bits of the cache are missing. In order to resolve your pathname into a -dentry, the VFS may have to resort to creating dentries along the way, -and then loading the inode. This is done by looking up the inode. - - -The Inode Object ----------------- - -An individual dentry usually has a pointer to an inode. Inodes are -filesystem objects such as regular files, directories, FIFOs and other -beasts. They live either on the disc (for block device filesystems) or -in the memory (for pseudo filesystems). Inodes that live on the disc -are copied into the memory when required and changes to the inode are -written back to disc. A single inode can be pointed to by multiple -dentries (hard links, for example, do this). - -To look up an inode requires that the VFS calls the lookup() method of -the parent directory inode. This method is installed by the specific -filesystem implementation that the inode lives in. Once the VFS has the -required dentry (and hence the inode), we can do all those boring things -like open(2) the file, or stat(2) it to peek at the inode data. The -stat(2) operation is fairly simple: once the VFS has the dentry, it -peeks at the inode data and passes some of it back to userspace. - - -The File Object ---------------- - -Opening a file requires another operation: allocation of a file -structure (this is the kernel-side implementation of file descriptors). -The freshly allocated file structure is initialized with a pointer to -the dentry and a set of file operation member functions. These are -taken from the inode data. The open() file method is then called so the -specific filesystem implementation can do its work. You can see that -this is another switch performed by the VFS. The file structure is -placed into the file descriptor table for the process. - -Reading, writing and closing files (and other assorted VFS operations) -is done by using the userspace file descriptor to grab the appropriate -file structure, and then calling the required file structure method to -do whatever is required. For as long as the file is open, it keeps the -dentry in use, which in turn means that the VFS inode is still in use. - - -Registering and Mounting a Filesystem -===================================== - -To register and unregister a filesystem, use the following API -functions: - - #include - - extern int register_filesystem(struct file_system_type *); - extern int unregister_filesystem(struct file_system_type *); - -The passed struct file_system_type describes your filesystem. When a -request is made to mount a filesystem onto a directory in your -namespace, the VFS will call the appropriate mount() method for the -specific filesystem. New vfsmount referring to the tree returned by -->mount() will be attached to the mountpoint, so that when pathname -resolution reaches the mountpoint it will jump into the root of that -vfsmount. - -You can see all filesystems that are registered to the kernel in the -file /proc/filesystems. - - -struct file_system_type ------------------------ - -This describes the filesystem. As of kernel 2.6.39, the following -members are defined: - -struct file_system_type { - const char *name; - int fs_flags; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); - void (*kill_sb) (struct super_block *); - struct module *owner; - struct file_system_type * next; - struct list_head fs_supers; - struct lock_class_key s_lock_key; - struct lock_class_key s_umount_key; -}; - - name: the name of the filesystem type, such as "ext2", "iso9660", - "msdos" and so on - - fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) - - mount: the method to call when a new instance of this - filesystem should be mounted - - kill_sb: the method to call when an instance of this filesystem - should be shut down - - owner: for internal VFS use: you should initialize this to THIS_MODULE in - most cases. - - next: for internal VFS use: you should initialize this to NULL - - s_lock_key, s_umount_key: lockdep-specific - -The mount() method has the following arguments: - - struct file_system_type *fs_type: describes the filesystem, partly initialized - by the specific filesystem code - - int flags: mount flags - - const char *dev_name: the device name we are mounting. - - void *data: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) - -The mount() method must return the root dentry of the tree requested by -caller. An active reference to its superblock must be grabbed and the -superblock must be locked. On failure it should return ERR_PTR(error). - -The arguments match those of mount(2) and their interpretation depends -on filesystem type. E.g. for block filesystems, dev_name is interpreted -as block device name, that device is opened and if it contains a -suitable filesystem image the method creates and initializes struct -super_block accordingly, returning its root dentry to caller. - -->mount() may choose to return a subtree of existing filesystem - it -doesn't have to create a new one. The main result from the caller's -point of view is a reference to dentry at the root of (sub)tree to be -attached; creation of new superblock is a common side effect. - -The most interesting member of the superblock structure that the mount() -method fills in is the "s_op" field. This is a pointer to a "struct -super_operations" which describes the next level of the filesystem -implementation. - -Usually, a filesystem uses one of the generic mount() implementations -and provides a fill_super() callback instead. The generic variants are: - - mount_bdev: mount a filesystem residing on a block device - - mount_nodev: mount a filesystem that is not backed by a device - - mount_single: mount a filesystem which shares the instance between - all mounts - -A fill_super() callback implementation has the following arguments: - - struct super_block *sb: the superblock structure. The callback - must initialize this properly. - - void *data: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) - - int silent: whether or not to be silent on error - - -The Superblock Object -===================== - -A superblock object represents a mounted filesystem. - - -struct super_operations ------------------------ - -This describes how the VFS can manipulate the superblock of your -filesystem. As of kernel 2.6.22, the following members are defined: - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_fs) (struct super_block *); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - int (*nr_cached_objects)(struct super_block *); - void (*free_cached_objects)(struct super_block *, int); -}; - -All methods are called without any locks being held, unless otherwise -noted. This means that most methods can block safely. All methods are -only called from a process context (i.e. not from an interrupt handler -or bottom half). - - alloc_inode: this method is called by alloc_inode() to allocate memory - for struct inode and initialize it. If this function is not - defined, a simple 'struct inode' is allocated. Normally - alloc_inode will be used to allocate a larger structure which - contains a 'struct inode' embedded within it. - - destroy_inode: this method is called by destroy_inode() to release - resources allocated for struct inode. It is only required if - ->alloc_inode was defined and simply undoes anything done by - ->alloc_inode. - - dirty_inode: this method is called by the VFS to mark an inode dirty. - - write_inode: this method is called when the VFS needs to write an - inode to disc. The second parameter indicates whether the write - should be synchronous or not, not all filesystems check this flag. - - drop_inode: called when the last access to the inode is dropped, - with the inode->i_lock spinlock held. - - This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do not - want to cache inodes - causing "delete_inode" to always be - called regardless of the value of i_nlink) - - The "generic_delete_inode()" behavior is equivalent to the - old practice of using "force_delete" in the put_inode() case, - but does not have the races that the "force_delete()" approach - had. - - delete_inode: called when the VFS wants to delete an inode - - put_super: called when the VFS wishes to free the superblock - (i.e. unmount). This is called with the superblock lock held - - sync_fs: called when VFS is writing out all dirty data associated with - a superblock. The second parameter indicates whether the method - should wait until the write out has been completed. Optional. - - freeze_fs: called when VFS is locking a filesystem and - forcing it into a consistent state. This method is currently - used by the Logical Volume Manager (LVM). - - unfreeze_fs: called when VFS is unlocking a filesystem and making it writable - again. - - statfs: called when the VFS needs to get filesystem statistics. - - remount_fs: called when the filesystem is remounted. This is called - with the kernel lock held - - clear_inode: called then the VFS clears the inode. Optional - - umount_begin: called when the VFS is unmounting a filesystem. - - show_options: called by the VFS to show mount options for - /proc//mounts. (see "Mount Options" section) - - quota_read: called by the VFS to read from filesystem quota file. - - quota_write: called by the VFS to write to filesystem quota file. - - nr_cached_objects: called by the sb cache shrinking function for the - filesystem to return the number of freeable cached objects it contains. - Optional. - - free_cache_objects: called by the sb cache shrinking function for the - filesystem to scan the number of objects indicated to try to free them. - Optional, but any filesystem implementing this method needs to also - implement ->nr_cached_objects for it to be called correctly. - - We can't do anything with any errors that the filesystem might - encountered, hence the void return type. This will never be called if - the VM is trying to reclaim under GFP_NOFS conditions, hence this - method does not need to handle that situation itself. - - Implementations must include conditional reschedule calls inside any - scanning loop that is done. This allows the VFS to determine - appropriate scan batch sizes without having to worry about whether - implementations will cause holdoff problems due to large scan batch - sizes. - -Whoever sets up the inode is responsible for filling in the "i_op" -field. This is a pointer to a "struct inode_operations" which describes -the methods that can be performed on individual inodes. - - -struct xattr_handlers ---------------------- - -On filesystems that support extended attributes (xattrs), the s_xattr -superblock field points to a NULL-terminated array of xattr handlers. -Extended attributes are name:value pairs. - - name: Indicates that the handler matches attributes with the specified name - (such as "system.posix_acl_access"); the prefix field must be NULL. - - prefix: Indicates that the handler matches all attributes with the specified - name prefix (such as "user."); the name field must be NULL. - - list: Determine if attributes matching this xattr handler should be listed - for a particular dentry. Used by some listxattr implementations like - generic_listxattr. - - get: Called by the VFS to get the value of a particular extended attribute. - This method is called by the getxattr(2) system call. - - set: Called by the VFS to set the value of a particular extended attribute. - When the new value is NULL, called to remove a particular extended - attribute. This method is called by the the setxattr(2) and - removexattr(2) system calls. - -When none of the xattr handlers of a filesystem match the specified -attribute name or when a filesystem doesn't support extended attributes, -the various *xattr(2) system calls return -EOPNOTSUPP. - - -The Inode Object -================ - -An inode object represents an object within the filesystem. - - -struct inode_operations ------------------------ - -This describes how the VFS can manipulate an inode in your filesystem. -As of kernel 2.6.22, the following members are defined: - -struct inode_operations { - int (*create) (struct inode *,struct dentry *, umode_t, bool); - struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - int (*link) (struct dentry *,struct inode *,struct dentry *); - int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,umode_t); - int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *, unsigned int); - int (*readlink) (struct dentry *, char __user *,int); - const char *(*get_link) (struct dentry *, struct inode *, - struct delayed_call *); - int (*permission) (struct inode *, int); - int (*get_acl)(struct inode *, int); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); - ssize_t (*listxattr) (struct dentry *, char *, size_t); - void (*update_time)(struct inode *, struct timespec *, int); - int (*atomic_open)(struct inode *, struct dentry *, struct file *, - unsigned open_flag, umode_t create_mode); - int (*tmpfile) (struct inode *, struct dentry *, umode_t); -}; - -Again, all methods are called without any locks being held, unless -otherwise noted. - - create: called by the open(2) and creat(2) system calls. Only - required if you want to support regular files. The dentry you - get should not have an inode (i.e. it should be a negative - dentry). Here you will probably call d_instantiate() with the - dentry and the newly created inode - - lookup: called when the VFS needs to look up an inode in a parent - directory. The name to look for is found in the dentry. This - method must call d_add() to insert the found inode into the - dentry. The "i_count" field in the inode structure should be - incremented. If the named inode does not exist a NULL inode - should be inserted into the dentry (this is called a negative - dentry). Returning an error code from this routine must only - be done on a real error, otherwise creating inodes with system - calls like create(2), mknod(2), mkdir(2) and so on will fail. - If you wish to overload the dentry methods then you should - initialise the "d_dop" field in the dentry; this is a pointer - to a struct "dentry_operations". - This method is called with the directory inode semaphore held - - link: called by the link(2) system call. Only required if you want - to support hard links. You will probably need to call - d_instantiate() just as you would in the create() method - - unlink: called by the unlink(2) system call. Only required if you - want to support deleting inodes - - symlink: called by the symlink(2) system call. Only required if you - want to support symlinks. You will probably need to call - d_instantiate() just as you would in the create() method - - mkdir: called by the mkdir(2) system call. Only required if you want - to support creating subdirectories. You will probably need to - call d_instantiate() just as you would in the create() method - - rmdir: called by the rmdir(2) system call. Only required if you want - to support deleting subdirectories - - mknod: called by the mknod(2) system call to create a device (char, - block) inode or a named pipe (FIFO) or socket. Only required - if you want to support creating these types of inodes. You - will probably need to call d_instantiate() just as you would - in the create() method - - rename: called by the rename(2) system call to rename the object to - have the parent and name given by the second inode and dentry. - - The filesystem must return -EINVAL for any unsupported or - unknown flags. Currently the following flags are implemented: - (1) RENAME_NOREPLACE: this flag indicates that if the target - of the rename exists the rename should fail with -EEXIST - instead of replacing the target. The VFS already checks for - existence, so for local filesystems the RENAME_NOREPLACE - implementation is equivalent to plain rename. - (2) RENAME_EXCHANGE: exchange source and target. Both must - exist; this is checked by the VFS. Unlike plain rename, - source and target may be of different type. - - get_link: called by the VFS to follow a symbolic link to the - inode it points to. Only required if you want to support - symbolic links. This method returns the symlink body - to traverse (and possibly resets the current position with - nd_jump_link()). If the body won't go away until the inode - is gone, nothing else is needed; if it needs to be otherwise - pinned, arrange for its release by having get_link(..., ..., done) - do set_delayed_call(done, destructor, argument). - In that case destructor(argument) will be called once VFS is - done with the body you've returned. - May be called in RCU mode; that is indicated by NULL dentry - argument. If request can't be handled without leaving RCU mode, - have it return ERR_PTR(-ECHILD). - - If the filesystem stores the symlink target in ->i_link, the - VFS may use it directly without calling ->get_link(); however, - ->get_link() must still be provided. ->i_link must not be - freed until after an RCU grace period. Writing to ->i_link - post-iget() time requires a 'release' memory barrier. - - readlink: this is now just an override for use by readlink(2) for the - cases when ->get_link uses nd_jump_link() or object is not in - fact a symlink. Normally filesystems should only implement - ->get_link for symlinks and readlink(2) will automatically use - that. - - permission: called by the VFS to check for access rights on a POSIX-like - filesystem. - - May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or - storing to the inode. - - If a situation is encountered that rcu-walk cannot handle, return - -ECHILD and it will be called again in ref-walk mode. - - setattr: called by the VFS to set attributes for a file. This method - is called by chmod(2) and related system calls. - - getattr: called by the VFS to get attributes of a file. This method - is called by stat(2) and related system calls. - - listxattr: called by the VFS to list all extended attributes for a - given file. This method is called by the listxattr(2) system call. - - update_time: called by the VFS to update a specific time or the i_version of - an inode. If this is not defined the VFS will update the inode itself - and call mark_inode_dirty_sync. - - atomic_open: called on the last component of an open. Using this optional - method the filesystem can look up, possibly create and open the file in - one atomic operation. If it wants to leave actual opening to the - caller (e.g. if the file turned out to be a symlink, device, or just - something filesystem won't do atomic open for), it may signal this by - returning finish_no_open(file, dentry). This method is only called if - the last component is negative or needs lookup. Cached positive dentries - are still handled by f_op->open(). If the file was created, - FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL - the method must only succeed if the file didn't exist and hence FMODE_CREATED - shall always be set on success. - - tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to - atomically creating, opening and unlinking a file in given directory. - - -The Address Space Object -======================== - -The address space object is used to group and manage pages in the page -cache. It can be used to keep track of the pages in a file (or anything -else) and also track the mapping of sections of the file into process -address spaces. - -There are a number of distinct yet related services that an -address-space can provide. These include communicating memory pressure, -page lookup by address, and keeping track of pages tagged as Dirty or -Writeback. - -The first can be used independently to the others. The VM can try to -either write dirty pages in order to clean them, or release clean pages -in order to reuse them. To do this it can call the ->writepage method -on dirty pages, and ->releasepage on clean pages with PagePrivate set. -Clean pages without PagePrivate and with no external references will be -released without notice being given to the address_space. - -To achieve this functionality, pages need to be placed on an LRU with -lru_cache_add and mark_page_active needs to be called whenever the page -is used. - -Pages are normally kept in a radix tree index by ->index. This tree -maintains information about the PG_Dirty and PG_Writeback status of each -page, so that pages with either of these flags can be found quickly. - -The Dirty tag is primarily used by mpage_writepages - the default -->writepages method. It uses the tag to find dirty pages to call -->writepage on. If mpage_writepages is not used (i.e. the address -provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost -unused. write_inode_now and sync_inode do use it (through -__sync_single_inode) to check if ->writepages has been successful in -writing out the whole address_space. - -The Writeback tag is used by filemap*wait* and sync_page* functions, via -filemap_fdatawait_range, to wait for all writeback to complete. - -An address_space handler may attach extra information to a page, -typically using the 'private' field in the 'struct page'. If such -information is attached, the PG_Private flag should be set. This will -cause various VM routines to make extra calls into the address_space -handler to deal with that data. - -An address space acts as an intermediate between storage and -application. Data is read into the address space a whole page at a -time, and provided to the application either by copying of the page, or -by memory-mapping the page. Data is written into the address space by -the application, and then written-back to storage typically in whole -pages, however the address_space has finer control of write sizes. - -The read process essentially only requires 'readpage'. The write -process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage and -writepages to writeback data to storage. - -Adding and removing pages to/from an address_space is protected by the -inode's i_mutex. - -When data is written to a page, the PG_Dirty flag should be set. It -typically remains set until writepage asks for it to be written. This -should clear PG_Dirty and set PG_Writeback. It can be actually written -at any point after PG_Dirty is clear. Once it is known to be safe, -PG_Writeback is cleared. - -Writeback makes use of a writeback_control structure to direct the -operations. This gives the the writepage and writepages operations some -information about the nature of and reason for the writeback request, -and the constraints under which it is being done. It is also used to -return information back to the caller about the result of a writepage or -writepages request. - - -Handling errors during writeback --------------------------------- - -Most applications that do buffered I/O will periodically call a file -synchronization call (fsync, fdatasync, msync or sync_file_range) to -ensure that data written has made it to the backing store. When there -is an error during writeback, they expect that error to be reported when -a file sync request is made. After an error has been reported on one -request, subsequent requests on the same file descriptor should return -0, unless further writeback errors have occurred since the previous file -syncronization. - -Ideally, the kernel would report errors only on file descriptions on -which writes were done that subsequently failed to be written back. The -generic pagecache infrastructure does not track the file descriptions -that have dirtied each individual page however, so determining which -file descriptors should get back an error is not possible. - -Instead, the generic writeback error tracking infrastructure in the -kernel settles for reporting errors to fsync on all file descriptions -that were open at the time that the error occurred. In a situation with -multiple writers, all of them will get back an error on a subsequent -fsync, even if all of the writes done through that particular file -descriptor succeeded (or even if there were no writes on that file -descriptor at all). - -Filesystems that wish to use this infrastructure should call -mapping_set_error to record the error in the address_space when it -occurs. Then, after writing back data from the pagecache in their -file->fsync operation, they should call file_check_and_advance_wb_err to -ensure that the struct file's error cursor has advanced to the correct -point in the stream of errors emitted by the backing device(s). - - -struct address_space_operations -------------------------------- - -This describes how the VFS can manipulate mapping of a file to page -cache in your filesystem. The following members are defined: - -struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); - int (*readpage)(struct file *, struct page *); - int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); - int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); - sector_t (*bmap)(struct address_space *, sector_t); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); - int (*releasepage) (struct page *, int); - void (*freepage)(struct page *); - ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* isolate a page for migration */ - bool (*isolate_page) (struct page *, isolate_mode_t); - /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); - /* put migration-failed page back to right list */ - void (*putback_page) (struct page *); - int (*launder_page) (struct page *); - - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); - void (*is_dirty_writeback) (struct page *, bool *, bool *); - int (*error_remove_page) (struct mapping *mapping, struct page *page); - int (*swap_activate)(struct file *); - int (*swap_deactivate)(struct file *); -}; - - writepage: called by the VM to write a dirty page to backing store. - This may happen for data integrity reasons (i.e. 'sync'), or - to free up memory (flush). The difference can be seen in - wbc->sync_mode. - The PG_Dirty flag has been cleared and PageLocked is true. - writepage should start writeout, should set PG_Writeback, - and should make sure the page is unlocked, either synchronously - or asynchronously when the write operation completes. - - If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to - try too hard if there are problems, and may choose to write out - other pages from the mapping if that is easier (e.g. due to - internal dependencies). If it chooses not to start writeout, it - should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep - calling ->writepage on that page. - - See the file "Locking" for more details. - - readpage: called by the VM to read a page from backing store. - The page will be Locked when readpage is called, and should be - unlocked and marked uptodate once the read completes. - If ->readpage discovers that it needs to unlock the page for - some reason, it can do so, and then return AOP_TRUNCATED_PAGE. - In this case, the page will be relocated, relocked and if - that all succeeds, ->readpage will be called again. - - writepages: called by the VM to write out pages associated with the - address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then - the writeback_control will specify a range of pages that must be - written out. If it is WBC_SYNC_NONE, then a nr_to_write is given - and that many pages should be written if possible. - If no ->writepages is given, then mpage_writepages is used - instead. This will choose pages from the address space that are - tagged as DIRTY and will pass them to ->writepage. - - set_page_dirty: called by the VM to set a page dirty. - This is particularly needed if an address space attaches - private data to a page, and that data needs to be updated when - a page is dirtied. This is called, for example, when a memory - mapped page gets modified. - If defined, it should set the PageDirty flag, and the - PAGECACHE_TAG_DIRTY tag in the radix tree. - - readpages: called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of - readpage. Instead of just one page, several pages are - requested. - readpages is only used for read-ahead, so read errors are - ignored. If anything goes wrong, feel free to give up. - - write_begin: - Called by the generic buffered write code to ask the filesystem to - prepare to write len bytes at the given offset in the file. The - address_space should check that the write will be able to complete, - by allocating space if necessary and doing any other internal - housekeeping. If the write will update parts of any basic-blocks on - storage, then those blocks should be pre-read (if they haven't been - read already) so that the updated blocks can be written out properly. - - The filesystem must return the locked pagecache page for the specified - offset, in *pagep, for the caller to write into. - - It must be able to cope with short writes (where the length passed to - write_begin is greater than the number of bytes copied into the page). - - flags is a field for AOP_FLAG_xxx flags, described in - include/linux/fs.h. - - A void * may be returned in fsdata, which then gets passed into - write_end. - - Returns 0 on success; < 0 on failure (which is the error code), in - which case write_end is not called. - - write_end: After a successful write_begin, and data copy, write_end must - be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied. - - The filesystem must take care of unlocking the page and releasing it - refcount, and updating i_size. - - Returns < 0 on failure, otherwise the number of bytes (<= 'copied') - that were able to be copied into pagecache. - - bmap: called by the VFS to map a logical block offset within object to - physical block number. This method is used by the FIBMAP - ioctl and for working with swap-files. To be able to swap to - a file, the file must have a stable mapping to a block - device. The swap system does not go through the filesystem - but instead uses bmap to find out where the blocks in the file - are and uses those addresses directly. - - invalidatepage: If a page has PagePrivate set, then invalidatepage - will be called when part or all of the page is to be removed - from the address space. This generally corresponds to either a - truncation, punch hole or a complete invalidation of the address - space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_SIZE). Any private data associated with the page - should be updated to reflect this truncation. If offset is 0 and - length is PAGE_SIZE, then the private data should be released, - because the page must be able to be completely discarded. This may - be done by calling the ->releasepage function, but in this case the - release MUST succeed. - - releasepage: releasepage is called on PagePrivate pages to indicate - that the page should be freed if possible. ->releasepage - should remove any private data from the page and clear the - PagePrivate flag. If releasepage() fails for some reason, it must - indicate failure with a 0 return value. - releasepage() is used in two distinct though related cases. The - first is when the VM finds a clean page with no active users and - wants to make it a free page. If ->releasepage succeeds, the - page will be removed from the address_space and become free. - - The second case is when a request has been made to invalidate - some or all pages in an address_space. This can happen - through the fadvise(POSIX_FADV_DONTNEED) system call or by the - filesystem explicitly requesting it as nfs and 9fs do (when - they believe the cache may be out of date with storage) by - calling invalidate_inode_pages2(). - If the filesystem makes such a call, and needs to be certain - that all pages are invalidated, then its releasepage will - need to ensure this. Possibly it can clear the PageUptodate - bit if it cannot free private data yet. - - freepage: freepage is called once the page is no longer visible in - the page cache in order to allow the cleanup of any private - data. Since it may be called by the memory reclaimer, it - should not assume that the original address_space mapping still - exists, and it should not block. - - direct_IO: called by the generic read/write routines to perform - direct_IO - that is IO requests which bypass the page cache - and transfer data directly between the storage and the - application's address space. - - isolate_page: Called by the VM when isolating a movable non-lru page. - If page is successfully isolated, VM marks the page as PG_isolated - via __SetPageIsolated. - - migrate_page: This is used to compact the physical memory usage. - If the VM wants to relocate a page (maybe off a memory card - that is signalling imminent failure) it will pass a new page - and an old page to this function. migrate_page should - transfer any private data across and update any references - that it has to the page. - - putback_page: Called by the VM when isolated page's migration fails. - - launder_page: Called before freeing a page - it writes back the dirty page. To - prevent redirtying the page, it is kept locked during the whole - operation. - - is_partially_uptodate: Called by the VM when reading a file through the - pagecache when the underlying blocksize != pagesize. If the required - block is up to date then the read can complete without needing the IO - to bring the whole page up to date. - - is_dirty_writeback: Called by the VM when attempting to reclaim a page. - The VM uses dirty and writeback information to determine if it needs - to stall to allow flushers a chance to complete some IO. Ordinarily - it can use PageDirty and PageWriteback but some filesystems have - more complex state (unstable pages in NFS prevent reclaim) or - do not set those flags due to locking problems. This callback - allows a filesystem to indicate to the VM if a page should be - treated as dirty or writeback for the purposes of stalling. - - error_remove_page: normally set to generic_error_remove_page if truncation - is ok for this address space. Used for memory failure handling. - Setting this implies you deal with pages going away under you, - unless you have them locked or reference counts increased. - - swap_activate: Called when swapon is used on a file to allocate - space if necessary and pin the block lookup information in - memory. A return value of zero indicates success, - in which case this file can be used to back swapspace. - - swap_deactivate: Called during swapoff on files where swap_activate - was successful. - - -The File Object -=============== - -A file object represents a file opened by a process. This is also known -as an "open file description" in POSIX parlance. - - -struct file_operations ----------------------- - -This describes how the VFS can manipulate an open file. As of kernel -4.18, the following members are defined: - -struct file_operations { - struct module *owner; - loff_t (*llseek) (struct file *, loff_t, int); - ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); - ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); - ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); - int (*iterate) (struct file *, struct dir_context *); - int (*iterate_shared) (struct file *, struct dir_context *); - __poll_t (*poll) (struct file *, struct poll_table_struct *); - long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); - long (*compat_ioctl) (struct file *, unsigned int, unsigned long); - int (*mmap) (struct file *, struct vm_area_struct *); - int (*open) (struct inode *, struct file *); - int (*flush) (struct file *, fl_owner_t id); - int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, loff_t, loff_t, int datasync); - int (*fasync) (int, struct file *, int); - int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - int (*check_flags)(int); - int (*flock) (struct file *, int, struct file_lock *); - ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); - int (*setlease)(struct file *, long, struct file_lock **, void **); - long (*fallocate)(struct file *file, int mode, loff_t offset, - loff_t len); - void (*show_fdinfo)(struct seq_file *m, struct file *f); -#ifndef CONFIG_MMU - unsigned (*mmap_capabilities)(struct file *); -#endif - ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); - int (*fadvise)(struct file *, loff_t, loff_t, int); -}; - -Again, all methods are called without any locks being held, unless -otherwise noted. - - llseek: called when the VFS needs to move the file position index - - read: called by read(2) and related system calls - - read_iter: possibly asynchronous read with iov_iter as destination - - write: called by write(2) and related system calls - - write_iter: possibly asynchronous write with iov_iter as source - - iopoll: called when aio wants to poll for completions on HIPRI iocbs - - iterate: called when the VFS needs to read the directory contents - - iterate_shared: called when the VFS needs to read the directory contents - when filesystem supports concurrent dir iterators - - poll: called by the VFS when a process wants to check if there is - activity on this file and (optionally) go to sleep until there - is activity. Called by the select(2) and poll(2) system calls - - unlocked_ioctl: called by the ioctl(2) system call. - - compat_ioctl: called by the ioctl(2) system call when 32 bit system calls - are used on 64 bit kernels. - - mmap: called by the mmap(2) system call - - open: called by the VFS when an inode should be opened. When the VFS - opens a file, it creates a new "struct file". It then calls the - open method for the newly allocated file structure. You might - think that the open method really belongs in - "struct inode_operations", and you may be right. I think it's - done the way it is because it makes filesystems simpler to - implement. The open() method is a good place to initialize the - "private_data" member in the file structure if you want to point - to a device structure - - flush: called by the close(2) system call to flush a file - - release: called when the last reference to an open file is closed - - fsync: called by the fsync(2) system call. Also see the section above - entitled "Handling errors during writeback". - - fasync: called by the fcntl(2) system call when asynchronous - (non-blocking) mode is enabled for a file - - lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW - commands - - get_unmapped_area: called by the mmap(2) system call - - check_flags: called by the fcntl(2) system call for F_SETFL command - - flock: called by the flock(2) system call - - splice_write: called by the VFS to splice data from a pipe to a file. This - method is used by the splice(2) system call - - splice_read: called by the VFS to splice data from file to a pipe. This - method is used by the splice(2) system call - - setlease: called by the VFS to set or release a file lock lease. setlease - implementations should call generic_setlease to record or remove - the lease in the inode after setting it. - - fallocate: called by the VFS to preallocate blocks or punch a hole. - - copy_file_range: called by the copy_file_range(2) system call. - - remap_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE and FIDEDUPERANGE commands to remap file ranges. An - implementation should remap len bytes at pos_in of the source file into - the dest file at pos_out. Implementations must handle callers passing - in len == 0; this means "remap to the end of the source file". The - return value should the number of bytes remapped, or the usual - negative error code if errors occurred before any bytes were remapped. - The remap_flags parameter accepts REMAP_FILE_* flags. If - REMAP_FILE_DEDUP is set then the implementation must only remap if the - requested file ranges have identical contents. If REMAP_CAN_SHORTEN is - set, the caller is ok with the implementation shortening the request - length to satisfy alignment or EOF requirements (or any other reason). - - fadvise: possibly called by the fadvise64() system call. - -Note that the file operations are implemented by the specific -filesystem in which the inode resides. When opening a device node -(character or block special) most filesystems will call special -support routines in the VFS which will locate the required device -driver information. These support routines replace the filesystem file -operations with those for the device driver, and then proceed to call -the new open() method for the file. This is how opening a device file -in the filesystem eventually ends up calling the device driver open() -method. - - -Directory Entry Cache (dcache) -============================== - - -struct dentry_operations ------------------------- - -This describes how a filesystem can overload the standard dentry -operations. Dentries and the dcache are the domain of the VFS and the -individual filesystem implementations. Device drivers have no business -here. These methods may be set to NULL, as they are either optional or -the VFS uses a default. As of kernel 2.6.22, the following members are -defined: - -struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); - int (*d_weak_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, - unsigned int, const char *, const struct qstr *); - int (*d_delete)(const struct dentry *); - int (*d_init)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); - char *(*d_dname)(struct dentry *, char *, int); - struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); -}; - - d_revalidate: called when the VFS needs to revalidate a dentry. This - is called whenever a name look-up finds a dentry in the - dcache. Most local filesystems leave this as NULL, because all their - dentries in the dcache are valid. Network filesystems are different - since things can change on the server without the client necessarily - being aware of it. - - This function should return a positive value if the dentry is still - valid, and zero or a negative error code if it isn't. - - d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). - If in rcu-walk mode, the filesystem must revalidate the dentry without - blocking or storing to the dentry, d_parent and d_inode should not be - used without care (because they can change and, in d_inode case, even - become NULL under us). - - If a situation is encountered that rcu-walk cannot handle, return - -ECHILD and it will be called again in ref-walk mode. - - d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry. - This is called when a path-walk ends at dentry that was not acquired by - doing a lookup in the parent directory. This includes "/", "." and "..", - as well as procfs-style symlinks and mountpoint traversal. - - In this case, we are less concerned with whether the dentry is still - fully correct, but rather that the inode is still valid. As with - d_revalidate, most local filesystems will set this to NULL since their - dcache entries are always valid. - - This function has the same return code semantics as d_revalidate. - - d_weak_revalidate is only called after leaving rcu-walk mode. - - d_hash: called when the VFS adds a dentry to the hash table. The first - dentry passed to d_hash is the parent directory that the name is - to be hashed into. - - Same locking and synchronisation rules as d_compare regarding - what is safe to dereference etc. - - d_compare: called to compare a dentry name with a given name. The first - dentry is the parent of the dentry to be compared, the second is - the child dentry. len and name string are properties of the dentry - to be compared. qstr is the name to compare it with. - - Must be constant and idempotent, and should not take locks if - possible, and should not or store into the dentry. - Should not dereference pointers outside the dentry without - lots of care (eg. d_parent, d_inode, d_name should not be used). - - However, our vfsmount is pinned, and RCU held, so the dentries and - inodes won't disappear, neither will our sb or filesystem module. - ->d_sb may be used. - - It is a tricky calling convention because it needs to be called under - "rcu-walk", ie. without any locks or references on things. - - d_delete: called when the last reference to a dentry is dropped and the - dcache is deciding whether or not to cache it. Return 1 to delete - immediately, or 0 to cache the dentry. Default is NULL which means to - always cache a reachable dentry. d_delete must be constant and - idempotent. - - d_init: called when a dentry is allocated - - d_release: called when a dentry is really deallocated - - d_iput: called when a dentry loses its inode (just prior to its - being deallocated). The default when this is NULL is that the - VFS calls iput(). If you define this method, you must call - iput() yourself - - d_dname: called when the pathname of a dentry should be generated. - Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay - pathname generation. (Instead of doing it when dentry is created, - it's done only when the path is needed.). Real filesystems probably - dont want to use it, because their dentries are present in global - dcache hash, so their hash should be an invariant. As no lock is - held, d_dname() should not try to modify the dentry itself, unless - appropriate SMP safety is used. CAUTION : d_path() logic is quite - tricky. The correct way to return for example "Hello" is to put it - at the end of the buffer, and returns a pointer to the first char. - dynamic_dname() helper function is provided to take care of this. - - Example : - - static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) - { - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); - } - - d_automount: called when an automount dentry is to be traversed (optional). - This should create a new VFS mount record and return the record to the - caller. The caller is supplied with a path parameter giving the - automount directory to describe the automount target and the parent - VFS mount record to provide inheritable mount parameters. NULL should - be returned if someone else managed to make the automount first. If - the vfsmount creation failed, then an error code should be returned. - If -EISDIR is returned, then the directory will be treated as an - ordinary directory and returned to pathwalk to continue walking. - - If a vfsmount is returned, the caller will attempt to mount it on the - mountpoint and will remove the vfsmount from its expiration list in - the case of failure. The vfsmount should be returned with 2 refs on - it to prevent automatic expiration - the caller will clean up the - additional ref. - - This function is only used if DCACHE_NEED_AUTOMOUNT is set on the - dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the - inode being added. - - d_manage: called to allow the filesystem to manage the transition from a - dentry (optional). This allows autofs, for example, to hold up clients - waiting to explore behind a 'mountpoint' while letting the daemon go - past and construct the subtree there. 0 should be returned to let the - calling process continue. -EISDIR can be returned to tell pathwalk to - use this directory as an ordinary directory and to ignore anything - mounted on it and not to check the automount flag. Any other error - code will abort pathwalk completely. - - If the 'rcu_walk' parameter is true, then the caller is doing a - pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, - and the caller can be asked to leave it and call again by returning - -ECHILD. -EISDIR may also be returned to tell pathwalk to - ignore d_automount or any mounts. - - This function is only used if DCACHE_MANAGE_TRANSIT is set on the - dentry being transited from. - - d_real: overlay/union type filesystems implement this method to return one of - the underlying dentries hidden by the overlay. It is used in two - different modes: - - Called from file_dentry() it returns the real dentry matching the inode - argument. The real dentry may be from a lower layer already copied up, - but still referenced from the file. This mode is selected with a - non-NULL inode argument. - - With NULL inode the topmost real underlying dentry is returned. - -Each dentry has a pointer to its parent dentry, as well as a hash list -of child dentries. Child dentries are basically like files in a -directory. - - -Directory Entry Cache API --------------------------- - -There are a number of functions defined which permit a filesystem to -manipulate dentries: - - dget: open a new handle for an existing dentry (this just increments - the usage count) - - dput: close a handle for a dentry (decrements the usage count). If - the usage count drops to 0, and the dentry is still in its - parent's hash, the "d_delete" method is called to check whether - it should be cached. If it should not be cached, or if the dentry - is not hashed, it is deleted. Otherwise cached dentries are put - into an LRU list to be reclaimed on memory shortage. - - d_drop: this unhashes a dentry from its parents hash list. A - subsequent call to dput() will deallocate the dentry if its - usage count drops to 0 - - d_delete: delete a dentry. If there are no other open references to - the dentry then the dentry is turned into a negative dentry - (the d_iput() method is called). If there are other - references, then d_drop() is called instead - - d_add: add a dentry to its parents hash list and then calls - d_instantiate() - - d_instantiate: add a dentry to the alias hash list for the inode and - updates the "d_inode" member. The "i_count" member in the - inode structure should be set/incremented. If the inode - pointer is NULL, the dentry is called a "negative - dentry". This function is commonly called when an inode is - created for an existing negative dentry - - d_lookup: look up a dentry given its parent and path name component - It looks up the child of that given name from the dcache - hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use dput() - to free the dentry when it finishes using it. - - -Mount Options -============= - - -Parsing options ---------------- - -On mount and remount the filesystem is passed a string containing a -comma separated list of mount options. The options can have either of -these forms: - - option - option=value - -The header defines an API that helps parse these -options. There are plenty of examples on how to use it in existing -filesystems. - - -Showing options ---------------- - -If a filesystem accepts mount options, it must define show_options() to -show all the currently active options. The rules are: - - - options MUST be shown which are not default or their values differ - from the default - - - options MAY be shown which are enabled by default or have their - default value - -Options used only internally between a mount helper and the kernel (such -as file descriptors), or which only have an effect during the mounting -(such as ones controlling the creation of a journal) are exempt from the -above rules. - -The underlying reason for the above rules is to make sure, that a mount -can be accurately replicated (e.g. umounting and mounting again) based -on the information found in /proc/mounts. - - -Resources -========= - -(Note some of these resources are not up-to-date with the latest kernel - version.) - -Creating Linux virtual filesystems. 2002 - - -The Linux Virtual File-system Layer by Neil Brown. 1999 - - -A tour of the Linux VFS by Michael K. Johnson. 1996 - - -A small trail through the Linux kernel by Andries Brouwer. 2001 - -- cgit From 0cb93b1503c19f19966402c3ed841626ac9db266 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 May 2019 15:50:10 +0530 Subject: dt-bindings: power: Add rpm power domain bindings for qcs404 Add RPM power domain bindings for the qcs404 family of SoC Reviewed-by: Rob Herring Reviewed-by: Jeffrey Hugo Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson [sibis: Add supported rpmpd states for qcs404] Signed-off-by: Sibi Sankar Signed-off-by: Bjorn Andersson Signed-off-by: Andy Gross --- Documentation/devicetree/bindings/power/qcom,rpmpd.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt index 980e5413d18f..172ccf940c5c 100644 --- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt +++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt @@ -6,6 +6,7 @@ which then translates it into a corresponding voltage on a rail Required Properties: - compatible: Should be one of the following * qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC + * qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC * qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC - #power-domain-cells: number of cells in Power domain specifier must be 1. -- cgit From dec9a05a147e4988e1f743c0d0a1389a0552e322 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Mon, 13 May 2019 15:50:13 +0530 Subject: dt-bindings: power: Add rpm power domain bindings for msm8998 Add RPM power domain bindings for the msm8998 family of SoC Reviewed-by: Rob Herring Reviewed-by: Jeffrey Hugo Reviewed-by: Vinod Koul Signed-off-by: Sibi Sankar Signed-off-by: Bjorn Andersson Signed-off-by: Andy Gross --- Documentation/devicetree/bindings/power/qcom,rpmpd.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt index 172ccf940c5c..eb35b22f9e23 100644 --- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt +++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt @@ -6,6 +6,7 @@ which then translates it into a corresponding voltage on a rail Required Properties: - compatible: Should be one of the following * qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC + * qcom,msm8998-rpmpd: RPM Power domain for the msm8998 family of SoC * qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC * qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC - #power-domain-cells: number of cells in Power domain specifier -- cgit From c920f745f45d4610e08943690301b8055fb5e834 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 28 May 2019 20:38:10 +0300 Subject: net: phy: Add phy_standalone sysfs entry Export a phy_standalone device attribute that is meant to give the indication that this PHY lacks an attached_dev and its corresponding sysfs link. The attribute will be created only when the phy_attach_direct() function will be called with a NULL net_device. Signed-off-by: Ioana Ciornei Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net-phydev | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-class-net-phydev b/Documentation/ABI/testing/sysfs-class-net-phydev index 6ebabfb27912..b7c46c2f3fd0 100644 --- a/Documentation/ABI/testing/sysfs-class-net-phydev +++ b/Documentation/ABI/testing/sysfs-class-net-phydev @@ -34,3 +34,11 @@ Description: xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui, xaui, 10gbase-kr, unknown +What: /sys/class/mdio_bus///phy_standalone +Date: May 2019 +KernelVersion: 5.3 +Contact: netdev@vger.kernel.org +Description: + Boolean value indicating whether the PHY device is used in + standalone mode, without a net_device associated, by PHYLINK. + Attribute created only when this is the case. -- cgit From 44cc27e43fa3b8977373915a8e7f515a9d263343 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 28 May 2019 20:38:12 +0300 Subject: net: phylink: Add struct phylink_config to PHYLINK API The phylink_config structure will encapsulate a pointer to a struct device and the operation type requested for this instance of PHYLINK. This patch does not make any functional changes, it just transitions the PHYLINK internals and all its users to the new API. A pointer to a phylink_config structure will be passed to phylink_create() instead of the net_device directly. Also, the same phylink_config pointer will be passed back to all phylink_mac_ops callbacks instead of the net_device. Using this mechanism, a PHYLINK user can get the original net_device using a structure such as 'to_net_dev(config->dev)' or directly the structure containing the phylink_config using a container_of call. At the moment, only the PHYLINK_NETDEV is defined as a valid operation type for PHYLINK. In this mode, a valid reference to a struct device linked to the original net_device should be passed to PHYLINK through the phylink_config structure. This API changes is mainly driven by the necessity of adding a new operation type in PHYLINK that disconnects the phy_device from the net_device and also works when the net_device is lacking. Signed-off-by: Ioana Ciornei Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Signed-off-by: David S. Miller --- Documentation/networking/sfp-phylink.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst index 5bd26cb07244..91446b431b70 100644 --- a/Documentation/networking/sfp-phylink.rst +++ b/Documentation/networking/sfp-phylink.rst @@ -98,6 +98,7 @@ this documentation. 4. Add:: struct phylink *phylink; + struct phylink_config phylink_config; to the driver's private data structure. We shall refer to the driver's private data pointer as ``priv`` below, and the driver's @@ -223,8 +224,10 @@ this documentation. .. code-block:: c struct phylink *phylink; + priv->phylink_config.dev = &dev.dev; + priv->phylink_config.type = PHYLINK_NETDEV; - phylink = phylink_create(dev, node, phy_mode, &phylink_ops); + phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops); if (IS_ERR(phylink)) { err = PTR_ERR(phylink); fail probe; -- cgit From 84ede58dfcd1db6f04f71dd3ccd5328271b346da Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 20 May 2019 09:54:46 -0700 Subject: crypto: hash - remove CRYPTO_ALG_TYPE_DIGEST Remove the unnecessary constant CRYPTO_ALG_TYPE_DIGEST, which has the same value as CRYPTO_ALG_TYPE_HASH. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/crypto/architecture.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst index ee8ff0762d7f..3eae1ae7f798 100644 --- a/Documentation/crypto/architecture.rst +++ b/Documentation/crypto/architecture.rst @@ -208,9 +208,7 @@ the aforementioned cipher types: - CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as an ECDH or DH implementation -- CRYPTO_ALG_TYPE_DIGEST Raw message digest - -- CRYPTO_ALG_TYPE_HASH Alias for CRYPTO_ALG_TYPE_DIGEST +- CRYPTO_ALG_TYPE_HASH Raw message digest - CRYPTO_ALG_TYPE_SHASH Synchronous multi-block hash -- cgit From 2af8c8927ebb474eb2af4ab3b8d7e92c3844b43f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 May 2019 18:26:50 +0200 Subject: dt-bindings: add Atmel SHA204A I2C crypto processor Add a compatible string for the Atmel SHA204A I2C crypto processor. Signed-off-by: Ard Biesheuvel Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 747fd3f689dc..a572c3468226 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -52,6 +52,8 @@ properties: - at,24c08 # i2c trusted platform module (TPM) - atmel,at97sc3204t + # i2c h/w symmetric crypto module + - atmel,atsha204a # CM32181: Ambient Light Sensor - capella,cm32181 # CM3232: Ambient Light Sensor -- cgit From 0adb0c99594b73844cf9a5714faa6553ea04ba04 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 24 May 2019 18:26:51 +0200 Subject: dt-bindings: move Atmel ECC508A I2C crypto processor to trivial-devices Move the binding for the discrete Atmel I2C Elliptic Curve h/w crypto module to trivial-devices.yaml, as it doesn't belong in atmel-crypto which describes unrelated on-SoC peripherals. Signed-off-by: Ard Biesheuvel Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/crypto/atmel-crypto.txt | 13 ------------- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt index 6b458bb2440d..f2aab3dc2b52 100644 --- a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt +++ b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt @@ -66,16 +66,3 @@ sha@f8034000 { dmas = <&dma1 2 17>; dma-names = "tx"; }; - -* Eliptic Curve Cryptography (I2C) - -Required properties: -- compatible : must be "atmel,atecc508a". -- reg: I2C bus address of the device. -- clock-frequency: must be present in the i2c controller node. - -Example: -atecc508a@c0 { - compatible = "atmel,atecc508a"; - reg = <0xC0>; -}; diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index a572c3468226..2e742d399e87 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -54,6 +54,8 @@ properties: - atmel,at97sc3204t # i2c h/w symmetric crypto module - atmel,atsha204a + # i2c h/w elliptic curve crypto module + - atmel,atecc508a # CM32181: Ambient Light Sensor - capella,cm32181 # CM3232: Ambient Light Sensor -- cgit From ae683c816d9f4aadec3eaf1da69a04c07ca9fd3f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 25 May 2019 15:41:39 +0200 Subject: ARM: dts: sun7i: icnova-swac: Fix the model vendor Even though the SWAC is just a baseboard to the icnova SoM, the vendor of the baseboard somehow ended up with the board name instead of the vendor name. Fix that. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/arm/sunxi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml index 285f4fc8519d..000a00d12d6a 100644 --- a/Documentation/devicetree/bindings/arm/sunxi.yaml +++ b/Documentation/devicetree/bindings/arm/sunxi.yaml @@ -263,7 +263,7 @@ properties: - description: ICNova A20 SWAC items: - - const: swac,icnova-a20-swac + - const: incircuit,icnova-a20-swac - const: incircuit,icnova-a20 - const: allwinner,sun7i-a20 -- cgit From c42eaffa16568a538f12dfebd99624659992913a Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:23 +0800 Subject: Documentation: add Linux PCI to Sphinx TOC tree Add index.rst for PCI subsystem. More docs will be added later. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas --- Documentation/PCI/index.rst | 9 +++++++++ Documentation/index.rst | 1 + 2 files changed, 10 insertions(+) create mode 100644 Documentation/PCI/index.rst (limited to 'Documentation') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst new file mode 100644 index 000000000000..c2f8728d11cf --- /dev/null +++ b/Documentation/PCI/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +Linux PCI Bus Subsystem +======================= + +.. toctree:: + :maxdepth: 2 + :numbered: diff --git a/Documentation/index.rst b/Documentation/index.rst index a7566ef62411..4afa431d9b1f 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -101,6 +101,7 @@ needed). filesystems/index vm/index bpf/index + PCI/index misc-devices/index Architecture-specific documentation -- cgit From 4d11d13e27762413399ab6f8dc49d30911cac17a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 May 2019 20:08:40 -0700 Subject: f2fs: add missing sysfs entries in documentation This patch cleans up documentation to cover missing sysfs entries. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 89 ++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index f7b5e4ff0de3..66aca042988e 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -246,11 +246,14 @@ Files in /sys/fs/f2fs/ .............................................................................. File Content - gc_max_sleep_time This tuning parameter controls the maximum sleep + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + + gc_min_sleep_time This tuning parameter controls the minimum sleep time for the garbage collection thread. Time is in milliseconds. - gc_min_sleep_time This tuning parameter controls the minimum sleep + gc_max_sleep_time This tuning parameter controls the maximum sleep time for the garbage collection thread. Time is in milliseconds. @@ -270,9 +273,6 @@ Files in /sys/fs/f2fs/ to 1, background thread starts to do GC by given gc_urgent_sleep_time interval. - gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. - 500 ms is set by default. See above gc_urgent. - reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments @@ -287,7 +287,16 @@ Files in /sys/fs/f2fs/ checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. - trim_sections This parameter controls the number of sections + discard_granularity This parameter controls the granularity of discard + command size. It will issue discard commands iif + the size is larger than given granularity. Its + unit size is 4KB, and 4 (=16KB) is set by default. + The maximum value is 128 (=512KB). + + reserved_blocks This parameter indicates the number of blocks that + f2fs reserves internally for root. + + batched_trim_sections This parameter controls the number of sections to be trimmed out in batch mode when FITRIM conducts. 32 sections is set by default. @@ -309,11 +318,35 @@ Files in /sys/fs/f2fs/ the number is less than this value, it triggers in-place-updates. + min_seq_blocks This parameter controls the threshold to serialize + write IOs issued by multiple threads in parallel. + + min_hot_blocks This parameter controls the threshold to allocate + a hot data log for pending data blocks to write. + + min_ssr_sections This parameter adds the threshold when deciding + SSR block allocation. If this is large, SSR mode + will be enabled early. + + ram_thresh This parameter controls the memory footprint used + by free nids and cached nat entries. By default, + 10 is set, which indicates 10 MB / 1 GB RAM. + + ra_nid_pages When building free nids, F2FS reads NAT blocks + ahead for speed up. Default is 0. + + dirty_nats_ratio Given dirty ratio of cached nat entries, F2FS + determines flushing them in background. + max_victim_search This parameter controls the number of trials to find a victim segment when conducting SSR and cleaning operations. The default value is 4096 which covers 8GB block address range. + migration_granularity For large-sized sections, F2FS can stop GC given + this granularity instead of reclaiming entire + section. + dir_level This parameter controls the directory level to support large directory. If a directory has a number of files, it can reduce the file lookup @@ -321,9 +354,47 @@ Files in /sys/fs/f2fs/ Otherwise, it needs to decrease this value to reduce the space overhead. The default value is 0. - ram_thresh This parameter controls the memory footprint used - by free nids and cached nat entries. By default, - 10 is set, which indicates 10 MB / 1 GB RAM. + cp_interval F2FS tries to do checkpoint periodically, 60 secs + by default. + + idle_interval F2FS detects system is idle, if there's no F2FS + operations during given interval, 5 secs by + default. + + discard_idle_interval F2FS detects the discard thread is idle, given + time interval. Default is 5 secs. + + gc_idle_interval F2FS detects the GC thread is idle, given time + interval. Default is 5 secs. + + umount_discard_timeout When unmounting the disk, F2FS waits for finishing + queued discard commands which can take huge time. + This gives time out for it, 5 secs by default. + + iostat_enable This controls to enable/disable iostat in F2FS. + + readdir_ra This enables/disabled readahead of inode blocks + in readdir, and default is enabled. + + gc_pin_file_thresh This indicates how many GC can be failed for the + pinned file. If it exceeds this, F2FS doesn't + guarantee its pinning state. 2048 trials is set + by default. + + extension_list This enables to change extension_list for hot/cold + files in runtime. + + inject_rate This controls injection rate of arbitrary faults. + + inject_type This controls injection type of arbitrary faults. + + dirty_segments This shows # of dirty segments. + + lifetime_write_kbytes This shows # of data written to the disk. + + features This shows current features enabled on F2FS. + + current_reserved_blocks This shows # of blocks currently reserved. ================================================================================ USAGE -- cgit From 9b88ad5464af1bf7228991f1c46a9a13484790a4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 May 2019 20:09:26 -0300 Subject: scripts/sphinx-pre-install: always check if version is compatible with build Call the script every time a make docs target is selected, on a simplified check mode. With this change, the script will set two vars: $min_version - obtained from `needs_sphinx` var inside conf.py (currently, '1.3') $rec_version - obtained from sphinx/requirements.txt. With those changes, a target like "make htmldocs" will do: 1) If no sphinx-build/sphinx-build3 is found, it will run the script on normal mode as before, checking for all system dependencies and providing install hints for the needed programs and will abort the build; 2) If no sphinx-build/sphinx-build3 is found, but there is a sphinx_${VER}/bin/activate file, and if ${VER} >= $min_version (string comparation), it will run in full mode, and will recommend to activate the virtualenv. If there are multiple virtualenvs, it will string sort the versions, recommending the highest version and will abort the build; 3) If Sphinx is detected but has a version lower than $min_version, it will run in full mode - with will recommend creating a virtual env using sphinx/requirements.txt, and will abort the build. 4) If Sphinx is detected and version is lower than $rec_version, it will run in full mode and will recommend creating a virtual env using sphinx/requirements.txt. In this case, it **won't** abort the build. 5) If Sphinx is detected and version is equal or righer than $rec_version it will return just after detecting the version ("quick mode"), not checking if are there any missing dependencies. Just like before, if one wants to install Sphinx from the distro, it has to call the script manually and use `--no-virtualenv` argument to get the hints for his OS: You should run: sudo dnf install -y python3-sphinx python3-sphinx_rtd_theme While here, add a small help for the three optional arguments for the script. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/Makefile b/Documentation/Makefile index e889e7cb8511..380e24053d6f 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -70,12 +70,14 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) $(abspath $(BUILDDIR)/$3/$4) htmldocs: + @./scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) linkcheckdocs: @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) latexdocs: + @./scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) ifeq ($(HAVE_PDFLATEX),0) @@ -87,14 +89,17 @@ pdfdocs: else # HAVE_PDFLATEX pdfdocs: latexdocs + @./scripts/sphinx-pre-install --version-check $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) endif # HAVE_PDFLATEX epubdocs: + @./scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) xmldocs: + @./scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) endif # HAVE_SPHINX -- cgit From cf08508d21ffae5aea6c7dcb771ebd28612c6120 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 May 2019 20:09:31 -0300 Subject: docs: by default, build docs a lot faster with Sphinx >= 1.7 Since Sphinx version 1.7, it is possible to use "-jauto" in order to speedup documentation builds. On older versions, while -j was already supported, one would need to set the number of threads manually. So, if SPHINXOPTS is not provided, add -jauto, in order to speed up the build. That makes it *a lot* times faster than without -j. If one really wants to slow things down, it can just use: make SPHINXOPTS=-j1 htmldocs Signed-off-by: Mauro Carvalho Chehab [ jc: fixed perl magic to determine sphinx version ] Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/Makefile b/Documentation/Makefile index 380e24053d6f..85d3cfafd77c 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -28,6 +28,8 @@ ifeq ($(HAVE_SPHINX),0) else # HAVE_SPHINX +export SPHINXOPTS = $(shell perl -e 'open IN,"sphinx-build --version 2>&1 |"; while () { if (m/([\d\.]+)/) { print "-jauto" if ($$1 >= "1.7") } ;} close IN') + # User-friendly check for pdflatex and latexmk HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi) -- cgit From a700767a7682d9bd237e927253274859aee075e7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 May 2019 20:09:32 -0300 Subject: docs: requirements.txt: recommend Sphinx 1.7.9 As discussed at the linux-doc ML, while we'll still support version 1.3, it is time to recommend a more modern version. So, let's switch the minimal requirements to Sphinx 1.7.9, as it has the "-jauto" flag, with makes a lot faster when building documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/sphinx.rst | 17 ++++++++--------- Documentation/sphinx/requirements.txt | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst index c039224b404e..4ba081f43e98 100644 --- a/Documentation/doc-guide/sphinx.rst +++ b/Documentation/doc-guide/sphinx.rst @@ -27,8 +27,7 @@ Sphinx Install ============== The ReST markups currently used by the Documentation/ files are meant to be -built with ``Sphinx`` version 1.3 or higher. If you desire to build -PDF output, it is recommended to use version 1.4.6 or higher. +built with ``Sphinx`` version 1.3 or higher. There's a script that checks for the Sphinx requirements. Please see :ref:`sphinx-pre-install` for further details. @@ -56,13 +55,13 @@ or ``virtualenv``, depending on how your distribution packaged Python 3. those expressions are written using LaTeX notation. It needs texlive installed with amdfonts and amsmath in order to evaluate them. -In summary, if you want to install Sphinx version 1.4.9, you should do:: +In summary, if you want to install Sphinx version 1.7.9, you should do:: - $ virtualenv sphinx_1.4 - $ . sphinx_1.4/bin/activate - (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_1.7.9 + $ . sphinx_1.7.9/bin/activate + (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt -After running ``. sphinx_1.4/bin/activate``, the prompt will change, +After running ``. sphinx_1.7.9/bin/activate``, the prompt will change, in order to indicate that you're using the new environment. If you open a new shell, you need to rerun this command to enter again at the virtual environment before building the documentation. @@ -105,8 +104,8 @@ command line options for your distro:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.4 - . sphinx_1.4/bin/activate + /usr/bin/virtualenv sphinx_1.7.9 + . sphinx_1.7.9/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt index 742be3e12619..14e29a0ae480 100644 --- a/Documentation/sphinx/requirements.txt +++ b/Documentation/sphinx/requirements.txt @@ -1,3 +1,3 @@ -docutils==0.12 -Sphinx==1.4.9 +docutils +Sphinx==1.7.9 sphinx_rtd_theme -- cgit From 2dc7e48dee68219dd8921824473306036ea45152 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 29 May 2019 12:34:00 -0400 Subject: Documentation: ip-sysctl.txt: Document tcp_fastopen_key Add docs for /proc/sys/net/ipv4/tcp_fastopen_key Signed-off-by: Jason Baron Signed-off-by: Christoph Paasch Cc: Jeremy Sowden Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 14fe93049d28..a73b3a02e49a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -648,6 +648,26 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER 0 to disable the blackhole detection. By default, it is set to 1hr. +tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs + The list consists of a primary key and an optional backup key. The + primary key is used for both creating and validating cookies, while the + optional backup key is only used for validating cookies. The purpose of + the backup key is to maximize TFO validation when keys are rotated. + + A randomly chosen primary key may be configured by the kernel if + the tcp_fastopen sysctl is set to 0x400 (see above), or if the + TCP_FASTOPEN setsockopt() optname is set and a key has not been + previously configured via sysctl. If keys are configured via + setsockopt() by using the TCP_FASTOPEN_KEY optname, then those + per-socket keys will be used instead of any keys that are specified via + sysctl. + + A key is specified as 4 8-digit hexadecimal integers which are separated + by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be + omitted. A primary and a backup key may be specified by separating them + by a comma. If only one key is specified, it becomes the primary key and + any previously configured backup keys are removed. + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 127. Default value -- cgit From ed0ac5c7ec3763e3261c48e3c5d4b7528b60fd85 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 May 2019 21:51:50 +0100 Subject: keys: Add a keyctl to move a key between keyrings Add a keyctl to atomically move a link to a key from one keyring to another. The key must exist in "from" keyring and a flag can be given to cause the operation to fail if there's a matching key already in the "to" keyring. This can be done with: keyctl(KEYCTL_MOVE, key_serial_t key, key_serial_t from_keyring, key_serial_t to_keyring, unsigned int flags); The key being moved must grant Link permission and both keyrings must grant Write permission. flags should be 0 or KEYCTL_MOVE_EXCL, with the latter preventing displacement of a matching key from the "to" keyring. Signed-off-by: David Howells --- Documentation/security/keys/core.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'Documentation') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 9521c4207f01..823d29bf44f7 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -577,6 +577,27 @@ The keyctl syscall functions are: added. + * Move a key from one keyring to another:: + + long keyctl(KEYCTL_MOVE, + key_serial_t id, + key_serial_t from_ring_id, + key_serial_t to_ring_id, + unsigned int flags); + + Move the key specified by "id" from the keyring specified by + "from_ring_id" to the keyring specified by "to_ring_id". If the two + keyrings are the same, nothing is done. + + "flags" can have KEYCTL_MOVE_EXCL set in it to cause the operation to fail + with EEXIST if a matching key exists in the destination keyring, otherwise + such a key will be replaced. + + A process must have link permission on the key for this function to be + successful and write permission on both keyrings. Any errors that can + occur from KEYCTL_LINK also apply on the destination keyring here. + + * Unlink a key or keyring from another keyring:: long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key); -- cgit From 229b4e0728e0a6ddca2645e73696d5b104fbbbfb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:24 +0800 Subject: Documentation: PCI: convert pci.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Move the description of struct pci_driver and struct pci_device_id into in-source comments. Signed-off-by: Changbin Du [bhelgaas: fix kernel-doc warnings related to moving descriptions to linux/pci.h, fix "space tab" whitespace errors in mod_devicetable.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 2 + Documentation/PCI/pci.rst | 578 ++++++++++++++++++++++++++++++++++++++++ Documentation/PCI/pci.txt | 636 -------------------------------------------- 3 files changed, 580 insertions(+), 636 deletions(-) create mode 100644 Documentation/PCI/pci.rst delete mode 100644 Documentation/PCI/pci.txt (limited to 'Documentation') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index c2f8728d11cf..7babf43709b0 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -7,3 +7,5 @@ Linux PCI Bus Subsystem .. toctree:: :maxdepth: 2 :numbered: + + pci diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst new file mode 100644 index 000000000000..6864f9a70f5f --- /dev/null +++ b/Documentation/PCI/pci.rst @@ -0,0 +1,578 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +How To Write Linux PCI Drivers +============================== + +:Authors: - Martin Mares + - Grant Grundler + +The world of PCI is vast and full of (mostly unpleasant) surprises. +Since each CPU architecture implements different chip-sets and PCI devices +have different requirements (erm, "features"), the result is the PCI support +in the Linux kernel is not as trivial as one would wish. This short paper +tries to introduce all potential driver authors to Linux APIs for +PCI device drivers. + +A more complete resource is the third edition of "Linux Device Drivers" +by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman. +LDD3 is available for free (under Creative Commons License) from: +http://lwn.net/Kernel/LDD3/. + +However, keep in mind that all documents are subject to "bit rot". +Refer to the source code if things are not working as described here. + +Please send questions/comments/patches about Linux PCI API to the +"Linux PCI" mailing list. + + +Structure of PCI drivers +======================== +PCI drivers "discover" PCI devices in a system via pci_register_driver(). +Actually, it's the other way around. When the PCI generic code discovers +a new device, the driver with a matching "description" will be notified. +Details on this below. + +pci_register_driver() leaves most of the probing for devices to +the PCI layer and supports online insertion/removal of devices [thus +supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver]. +pci_register_driver() call requires passing in a table of function +pointers and thus dictates the high level structure of a driver. + +Once the driver knows about a PCI device and takes ownership, the +driver generally needs to perform the following initialization: + + - Enable the device + - Request MMIO/IOP resources + - Set the DMA mask size (for both coherent and streaming DMA) + - Allocate and initialize shared control data (pci_allocate_coherent()) + - Access device configuration space (if needed) + - Register IRQ handler (request_irq()) + - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) + - Enable DMA/processing engines + +When done using the device, and perhaps the module needs to be unloaded, +the driver needs to take the follow steps: + + - Disable the device from generating IRQs + - Release the IRQ (free_irq()) + - Stop all DMA activity + - Release DMA buffers (both streaming and coherent) + - Unregister from other subsystems (e.g. scsi or netdev) + - Release MMIO/IOP resources + - Disable the device + +Most of these topics are covered in the following sections. +For the rest look at LDD3 or . + +If the PCI subsystem is not configured (CONFIG_PCI is not set), most of +the PCI functions described below are defined as inline functions either +completely empty or just returning an appropriate error codes to avoid +lots of ifdefs in the drivers. + + +pci_register_driver() call +========================== + +PCI device drivers call ``pci_register_driver()`` during their +initialization with a pointer to a structure describing the driver +(``struct pci_driver``): + +.. kernel-doc:: include/linux/pci.h + :functions: pci_driver + +The ID table is an array of ``struct pci_device_id`` entries ending with an +all-zero entry. Definitions with static const are generally preferred. + +.. kernel-doc:: include/linux/mod_devicetable.h + :functions: pci_device_id + +Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up +a pci_device_id table. + +New PCI IDs may be added to a device driver pci_ids table at runtime +as shown below:: + + echo "vendor device subvendor subdevice class class_mask driver_data" > \ + /sys/bus/pci/drivers/{driver}/new_id + +All fields are passed in as hexadecimal values (no leading 0x). +The vendor and device fields are mandatory, the others are optional. Users +need pass only as many optional fields as necessary: + + - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) + - class and classmask fields default to 0 + - driver_data defaults to 0UL. + +Note that driver_data must match the value used by any of the pci_device_id +entries defined in the driver. This makes the driver_data field mandatory +if all the pci_device_id entries have a non-zero driver_data value. + +Once added, the driver probe routine will be invoked for any unclaimed +PCI devices listed in its (newly updated) pci_ids list. + +When the driver exits, it just calls pci_unregister_driver() and the PCI layer +automatically calls the remove hook for all devices handled by the driver. + + +"Attributes" for driver functions/data +-------------------------------------- + +Please mark the initialization and cleanup functions where appropriate +(the corresponding macros are defined in ): + + ====== ================================================= + __init Initialization code. Thrown away after the driver + initializes. + __exit Exit code. Ignored for non-modular drivers. + ====== ================================================= + +Tips on when/where to use the above attributes: + - The module_init()/module_exit() functions (and all + initialization functions called _only_ from these) + should be marked __init/__exit. + + - Do not mark the struct pci_driver. + + - Do NOT mark a function if you are not sure which mark to use. + Better to not mark the function than mark the function wrong. + + +How to find PCI devices manually +================================ + +PCI drivers should have a really good reason for not using the +pci_register_driver() interface to search for PCI devices. +The main reason PCI devices are controlled by multiple drivers +is because one PCI device implements several different HW services. +E.g. combined serial/parallel port/floppy controller. + +A manual search may be performed using the following constructs: + +Searching by vendor and device ID:: + + struct pci_dev *dev = NULL; + while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) + configure_device(dev); + +Searching by class ID (iterate in a similar way):: + + pci_get_class(CLASS_ID, dev) + +Searching by both vendor/device and subsystem vendor/device ID:: + + pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev). + +You can use the constant PCI_ANY_ID as a wildcard replacement for +VENDOR_ID or DEVICE_ID. This allows searching for any device from a +specific vendor, for example. + +These functions are hotplug-safe. They increment the reference count on +the pci_dev that they return. You must eventually (possibly at module unload) +decrement the reference count on these devices by calling pci_dev_put(). + + +Device Initialization Steps +=========================== + +As noted in the introduction, most PCI drivers need the following steps +for device initialization: + + - Enable the device + - Request MMIO/IOP resources + - Set the DMA mask size (for both coherent and streaming DMA) + - Allocate and initialize shared control data (pci_allocate_coherent()) + - Access device configuration space (if needed) + - Register IRQ handler (request_irq()) + - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) + - Enable DMA/processing engines. + +The driver can access PCI config space registers at any time. +(Well, almost. When running BIST, config space can go away...but +that will just result in a PCI Bus Master Abort and config reads +will return garbage). + + +Enable the PCI device +--------------------- +Before touching any device registers, the driver needs to enable +the PCI device by calling pci_enable_device(). This will: + + - wake up the device if it was in suspended state, + - allocate I/O and memory regions of the device (if BIOS did not), + - allocate an IRQ (if BIOS did not). + +.. note:: + pci_enable_device() can fail! Check the return value. + +.. warning:: + OS BUG: we don't check resource allocations before enabling those + resources. The sequence would make more sense if we called + pci_request_resources() before calling pci_enable_device(). + Currently, the device drivers can't detect the bug when when two + devices have been allocated the same range. This is not a common + problem and unlikely to get fixed soon. + + This has been discussed before but not changed as of 2.6.19: + http://lkml.org/lkml/2006/3/2/194 + + +pci_set_master() will enable DMA by setting the bus master bit +in the PCI_COMMAND register. It also fixes the latency timer value if +it's set to something bogus by the BIOS. pci_clear_master() will +disable DMA by clearing the bus master bit. + +If the PCI device can use the PCI Memory-Write-Invalidate transaction, +call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval +and also ensures that the cache line size register is set correctly. +Check the return value of pci_set_mwi() as not all architectures +or chip-sets may support Memory-Write-Invalidate. Alternatively, +if Mem-Wr-Inval would be nice to have but is not required, call +pci_try_set_mwi() to have the system do its best effort at enabling +Mem-Wr-Inval. + + +Request MMIO/IOP resources +-------------------------- +Memory (MMIO), and I/O port addresses should NOT be read directly +from the PCI device config space. Use the values in the pci_dev structure +as the PCI "bus address" might have been remapped to a "host physical" +address by the arch/chip-set specific kernel support. + +See Documentation/io-mapping.txt for how to access device registers +or device memory. + +The device driver needs to call pci_request_region() to verify +no other device is already using the same address resource. +Conversely, drivers should call pci_release_region() AFTER +calling pci_disable_device(). +The idea is to prevent two devices colliding on the same address range. + +.. tip:: + See OS BUG comment above. Currently (2.6.19), The driver can only + determine MMIO and IO Port resource availability _after_ calling + pci_enable_device(). + +Generic flavors of pci_request_region() are request_mem_region() +(for MMIO ranges) and request_region() (for IO Port ranges). +Use these for address resources that are not described by "normal" PCI +BARs. + +Also see pci_request_selected_regions() below. + + +Set the DMA mask size +--------------------- +.. note:: + If anything below doesn't make sense, please refer to + Documentation/DMA-API.txt. This section is just a reminder that + drivers need to indicate DMA capabilities of the device and is not + an authoritative source for DMA interfaces. + +While all drivers should explicitly indicate the DMA capability +(e.g. 32 or 64 bit) of the PCI bus master, devices with more than +32-bit bus master capability for streaming data need the driver +to "register" this capability by calling pci_set_dma_mask() with +appropriate parameters. In general this allows more efficient DMA +on systems where System RAM exists above 4G _physical_ address. + +Drivers for all PCI-X and PCIe compliant devices must call +pci_set_dma_mask() as they are 64-bit DMA devices. + +Similarly, drivers must also "register" this capability if the device +can directly address "consistent memory" in System RAM above 4G physical +address by calling pci_set_consistent_dma_mask(). +Again, this includes drivers for all PCI-X and PCIe compliant devices. +Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are +64-bit DMA capable for payload ("streaming") data but not control +("consistent") data. + + +Setup shared control data +------------------------- +Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared) +memory. See Documentation/DMA-API.txt for a full description of +the DMA APIs. This section is just a reminder that it needs to be done +before enabling DMA on the device. + + +Initialize device registers +--------------------------- +Some drivers will need specific "capability" fields programmed +or other "vendor specific" register initialized or reset. +E.g. clearing pending interrupts. + + +Register IRQ handler +-------------------- +While calling request_irq() is the last step described here, +this is often just another intermediate step to initialize a device. +This step can often be deferred until the device is opened for use. + +All interrupt handlers for IRQ lines should be registered with IRQF_SHARED +and use the devid to map IRQs to devices (remember that all PCI IRQ lines +can be shared). + +request_irq() will associate an interrupt handler and device handle +with an interrupt number. Historically interrupt numbers represent +IRQ lines which run from the PCI device to the Interrupt controller. +With MSI and MSI-X (more below) the interrupt number is a CPU "vector". + +request_irq() also enables the interrupt. Make sure the device is +quiesced and does not have any interrupts pending before registering +the interrupt handler. + +MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts" +which deliver interrupts to the CPU via a DMA write to a Local APIC. +The fundamental difference between MSI and MSI-X is how multiple +"vectors" get allocated. MSI requires contiguous blocks of vectors +while MSI-X can allocate several individual ones. + +MSI capability can be enabled by calling pci_alloc_irq_vectors() with the +PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This +causes the PCI support to program CPU vector data into the PCI device +capability registers. Many architectures, chip-sets, or BIOSes do NOT +support MSI or MSI-X and a call to pci_alloc_irq_vectors with just +the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always +specify PCI_IRQ_LEGACY as well. + +Drivers that have different interrupt handlers for MSI/MSI-X and +legacy INTx should chose the right one based on the msi_enabled +and msix_enabled flags in the pci_dev structure after calling +pci_alloc_irq_vectors. + +There are (at least) two really good reasons for using MSI: + +1) MSI is an exclusive interrupt vector by definition. + This means the interrupt handler doesn't have to verify + its device caused the interrupt. + +2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed + to be visible to the host CPU(s) when the MSI is delivered. This + is important for both data coherency and avoiding stale control data. + This guarantee allows the driver to omit MMIO reads to flush + the DMA stream. + +See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples +of MSI/MSI-X usage. + + +PCI device shutdown +=================== + +When a PCI device driver is being unloaded, most of the following +steps need to be performed: + + - Disable the device from generating IRQs + - Release the IRQ (free_irq()) + - Stop all DMA activity + - Release DMA buffers (both streaming and consistent) + - Unregister from other subsystems (e.g. scsi or netdev) + - Disable device from responding to MMIO/IO Port addresses + - Release MMIO/IO Port resource(s) + + +Stop IRQs on the device +----------------------- +How to do this is chip/device specific. If it's not done, it opens +the possibility of a "screaming interrupt" if (and only if) +the IRQ is shared with another device. + +When the shared IRQ handler is "unhooked", the remaining devices +using the same IRQ line will still need the IRQ enabled. Thus if the +"unhooked" device asserts IRQ line, the system will respond assuming +it was one of the remaining devices asserted the IRQ line. Since none +of the other devices will handle the IRQ, the system will "hang" until +it decides the IRQ isn't going to get handled and masks the IRQ (100,000 +iterations later). Once the shared IRQ is masked, the remaining devices +will stop functioning properly. Not a nice situation. + +This is another reason to use MSI or MSI-X if it's available. +MSI and MSI-X are defined to be exclusive interrupts and thus +are not susceptible to the "screaming interrupt" problem. + + +Release the IRQ +--------------- +Once the device is quiesced (no more IRQs), one can call free_irq(). +This function will return control once any pending IRQs are handled, +"unhook" the drivers IRQ handler from that IRQ, and finally release +the IRQ if no one else is using it. + + +Stop all DMA activity +--------------------- +It's extremely important to stop all DMA operations BEFORE attempting +to deallocate DMA control data. Failure to do so can result in memory +corruption, hangs, and on some chip-sets a hard crash. + +Stopping DMA after stopping the IRQs can avoid races where the +IRQ handler might restart DMA engines. + +While this step sounds obvious and trivial, several "mature" drivers +didn't get this step right in the past. + + +Release DMA buffers +------------------- +Once DMA is stopped, clean up streaming DMA first. +I.e. unmap data buffers and return buffers to "upstream" +owners if there is one. + +Then clean up "consistent" buffers which contain the control data. + +See Documentation/DMA-API.txt for details on unmapping interfaces. + + +Unregister from other subsystems +-------------------------------- +Most low level PCI device drivers support some other subsystem +like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your +driver isn't losing resources from that other subsystem. +If this happens, typically the symptom is an Oops (panic) when +the subsystem attempts to call into a driver that has been unloaded. + + +Disable Device from responding to MMIO/IO Port addresses +-------------------------------------------------------- +io_unmap() MMIO or IO Port resources and then call pci_disable_device(). +This is the symmetric opposite of pci_enable_device(). +Do not access device registers after calling pci_disable_device(). + + +Release MMIO/IO Port Resource(s) +-------------------------------- +Call pci_release_region() to mark the MMIO or IO Port range as available. +Failure to do so usually results in the inability to reload the driver. + + +How to access PCI config space +============================== + +You can use `pci_(read|write)_config_(byte|word|dword)` to access the config +space of a device represented by `struct pci_dev *`. All these functions return +0 when successful or an error code (`PCIBIOS_...`) which can be translated to a +text string by pcibios_strerror. Most drivers expect that accesses to valid PCI +devices don't fail. + +If you don't have a struct pci_dev available, you can call +`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device +and function on that bus. + +If you access fields in the standard portion of the config header, please +use symbolic names of locations and bits declared in . + +If you need to access Extended PCI Capability registers, just call +pci_find_capability() for the particular capability and it will find the +corresponding register block for you. + + +Other interesting functions +=========================== + +============================= ================================================ +pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain, + bus and slot and number. If the device is + found, its reference count is increased. +pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3) +pci_find_capability() Find specified capability in device's capability + list. +pci_resource_start() Returns bus start address for a given PCI region +pci_resource_end() Returns bus end address for a given PCI region +pci_resource_len() Returns the byte length of a PCI region +pci_set_drvdata() Set private driver data pointer for a pci_dev +pci_get_drvdata() Return private driver data pointer for a pci_dev +pci_set_mwi() Enable Memory-Write-Invalidate transactions. +pci_clear_mwi() Disable Memory-Write-Invalidate transactions. +============================= ================================================ + + +Miscellaneous hints +=================== + +When displaying PCI device names to the user (for example when a driver wants +to tell the user what card has it found), please use pci_name(pci_dev). + +Always refer to the PCI devices by a pointer to the pci_dev structure. +All PCI layer functions use this identification and it's the only +reasonable one. Don't use bus/slot/function numbers except for very +special purposes -- on systems with multiple primary buses their semantics +can be pretty complex. + +Don't try to turn on Fast Back to Back writes in your driver. All devices +on the bus need to be capable of doing it, so this is something which needs +to be handled by platform and generic code, not individual drivers. + + +Vendor and device identifications +================================= + +Do not add new device or vendor IDs to include/linux/pci_ids.h unless they +are shared across multiple drivers. You can add private definitions in +your driver if they're helpful, or just use plain hex constants. + +The device IDs are arbitrary hex numbers (vendor controlled) and normally used +only in a single location, the pci_device_id table. + +Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/. +There are mirrors of the pci.ids file at http://pciids.sourceforge.net/ +and https://github.com/pciutils/pciids. + + +Obsolete functions +================== + +There are several functions which you might come across when trying to +port an old driver to the new PCI interface. They are no longer present +in the kernel as they aren't compatible with hotplug or PCI domains or +having sane locking. + +================= =========================================== +pci_find_device() Superseded by pci_get_device() +pci_find_subsys() Superseded by pci_get_subsys() +pci_find_slot() Superseded by pci_get_domain_bus_and_slot() +pci_get_slot() Superseded by pci_get_domain_bus_and_slot() +================= =========================================== + +The alternative is the traditional PCI device driver that walks PCI +device lists. This is still possible but discouraged. + + +MMIO Space and "Write Posting" +============================== + +Converting a driver from using I/O Port space to using MMIO space +often requires some additional changes. Specifically, "write posting" +needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2) +already do this. I/O Port space guarantees write transactions reach the PCI +device before the CPU can continue. Writes to MMIO space allow the CPU +to continue before the transaction reaches the PCI device. HW weenies +call this "Write Posting" because the write completion is "posted" to +the CPU before the transaction has reached its destination. + +Thus, timing sensitive code should add readl() where the CPU is +expected to wait before doing other work. The classic "bit banging" +sequence works fine for I/O Port space:: + + for (i = 8; --i; val >>= 1) { + outb(val & 1, ioport_reg); /* write bit */ + udelay(10); + } + +The same sequence for MMIO space should be:: + + for (i = 8; --i; val >>= 1) { + writeb(val & 1, mmio_reg); /* write bit */ + readb(safe_mmio_reg); /* flush posted write */ + udelay(10); + } + +It is important that "safe_mmio_reg" not have any side effects that +interferes with the correct operation of the device. + +Another case to watch out for is when resetting a PCI device. Use PCI +Configuration space reads to flush the writel(). This will gracefully +handle the PCI master abort on all platforms if the PCI device is +expected to not respond to a readl(). Most x86 platforms will allow +MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage +(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail"). diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt deleted file mode 100644 index badb26ac33dc..000000000000 --- a/Documentation/PCI/pci.txt +++ /dev/null @@ -1,636 +0,0 @@ - - How To Write Linux PCI Drivers - - by Martin Mares on 07-Feb-2000 - updated by Grant Grundler on 23-Dec-2006 - -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The world of PCI is vast and full of (mostly unpleasant) surprises. -Since each CPU architecture implements different chip-sets and PCI devices -have different requirements (erm, "features"), the result is the PCI support -in the Linux kernel is not as trivial as one would wish. This short paper -tries to introduce all potential driver authors to Linux APIs for -PCI device drivers. - -A more complete resource is the third edition of "Linux Device Drivers" -by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman. -LDD3 is available for free (under Creative Commons License) from: - - http://lwn.net/Kernel/LDD3/ - -However, keep in mind that all documents are subject to "bit rot". -Refer to the source code if things are not working as described here. - -Please send questions/comments/patches about Linux PCI API to the -"Linux PCI" mailing list. - - - -0. Structure of PCI drivers -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -PCI drivers "discover" PCI devices in a system via pci_register_driver(). -Actually, it's the other way around. When the PCI generic code discovers -a new device, the driver with a matching "description" will be notified. -Details on this below. - -pci_register_driver() leaves most of the probing for devices to -the PCI layer and supports online insertion/removal of devices [thus -supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver]. -pci_register_driver() call requires passing in a table of function -pointers and thus dictates the high level structure of a driver. - -Once the driver knows about a PCI device and takes ownership, the -driver generally needs to perform the following initialization: - - Enable the device - Request MMIO/IOP resources - Set the DMA mask size (for both coherent and streaming DMA) - Allocate and initialize shared control data (pci_allocate_coherent()) - Access device configuration space (if needed) - Register IRQ handler (request_irq()) - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) - Enable DMA/processing engines - -When done using the device, and perhaps the module needs to be unloaded, -the driver needs to take the follow steps: - Disable the device from generating IRQs - Release the IRQ (free_irq()) - Stop all DMA activity - Release DMA buffers (both streaming and coherent) - Unregister from other subsystems (e.g. scsi or netdev) - Release MMIO/IOP resources - Disable the device - -Most of these topics are covered in the following sections. -For the rest look at LDD3 or . - -If the PCI subsystem is not configured (CONFIG_PCI is not set), most of -the PCI functions described below are defined as inline functions either -completely empty or just returning an appropriate error codes to avoid -lots of ifdefs in the drivers. - - - -1. pci_register_driver() call -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PCI device drivers call pci_register_driver() during their -initialization with a pointer to a structure describing the driver -(struct pci_driver): - - field name Description - ---------- ------------------------------------------------------ - id_table Pointer to table of device ID's the driver is - interested in. Most drivers should export this - table using MODULE_DEVICE_TABLE(pci,...). - - probe This probing function gets called (during execution - of pci_register_driver() for already existing - devices or later if a new device gets inserted) for - all PCI devices which match the ID table and are not - "owned" by the other drivers yet. This function gets - passed a "struct pci_dev *" for each device whose - entry in the ID table matches the device. The probe - function returns zero when the driver chooses to - take "ownership" of the device or an error code - (negative number) otherwise. - The probe function always gets called from process - context, so it can sleep. - - remove The remove() function gets called whenever a device - being handled by this driver is removed (either during - deregistration of the driver or when it's manually - pulled out of a hot-pluggable slot). - The remove function always gets called from process - context, so it can sleep. - - suspend Put device into low power state. - suspend_late Put device into low power state. - - resume_early Wake device from low power state. - resume Wake device from low power state. - - (Please see Documentation/power/pci.txt for descriptions - of PCI Power Management and the related functions.) - - shutdown Hook into reboot_notifier_list (kernel/sys.c). - Intended to stop any idling DMA operations. - Useful for enabling wake-on-lan (NIC) or changing - the power state of a device before reboot. - e.g. drivers/net/e100.c. - - err_handler See Documentation/PCI/pci-error-recovery.txt - - -The ID table is an array of struct pci_device_id entries ending with an -all-zero entry. Definitions with static const are generally preferred. - -Each entry consists of: - - vendor,device Vendor and device ID to match (or PCI_ANY_ID) - - subvendor, Subsystem vendor and device ID to match (or PCI_ANY_ID) - subdevice, - - class Device class, subclass, and "interface" to match. - See Appendix D of the PCI Local Bus Spec or - include/linux/pci_ids.h for a full list of classes. - Most drivers do not need to specify class/class_mask - as vendor/device is normally sufficient. - - class_mask limit which sub-fields of the class field are compared. - See drivers/scsi/sym53c8xx_2/ for example of usage. - - driver_data Data private to the driver. - Most drivers don't need to use driver_data field. - Best practice is to use driver_data as an index - into a static list of equivalent device types, - instead of using it as a pointer. - - -Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up -a pci_device_id table. - -New PCI IDs may be added to a device driver pci_ids table at runtime -as shown below: - -echo "vendor device subvendor subdevice class class_mask driver_data" > \ -/sys/bus/pci/drivers/{driver}/new_id - -All fields are passed in as hexadecimal values (no leading 0x). -The vendor and device fields are mandatory, the others are optional. Users -need pass only as many optional fields as necessary: - o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) - o class and classmask fields default to 0 - o driver_data defaults to 0UL. - -Note that driver_data must match the value used by any of the pci_device_id -entries defined in the driver. This makes the driver_data field mandatory -if all the pci_device_id entries have a non-zero driver_data value. - -Once added, the driver probe routine will be invoked for any unclaimed -PCI devices listed in its (newly updated) pci_ids list. - -When the driver exits, it just calls pci_unregister_driver() and the PCI layer -automatically calls the remove hook for all devices handled by the driver. - - -1.1 "Attributes" for driver functions/data - -Please mark the initialization and cleanup functions where appropriate -(the corresponding macros are defined in ): - - __init Initialization code. Thrown away after the driver - initializes. - __exit Exit code. Ignored for non-modular drivers. - -Tips on when/where to use the above attributes: - o The module_init()/module_exit() functions (and all - initialization functions called _only_ from these) - should be marked __init/__exit. - - o Do not mark the struct pci_driver. - - o Do NOT mark a function if you are not sure which mark to use. - Better to not mark the function than mark the function wrong. - - - -2. How to find PCI devices manually -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PCI drivers should have a really good reason for not using the -pci_register_driver() interface to search for PCI devices. -The main reason PCI devices are controlled by multiple drivers -is because one PCI device implements several different HW services. -E.g. combined serial/parallel port/floppy controller. - -A manual search may be performed using the following constructs: - -Searching by vendor and device ID: - - struct pci_dev *dev = NULL; - while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) - configure_device(dev); - -Searching by class ID (iterate in a similar way): - - pci_get_class(CLASS_ID, dev) - -Searching by both vendor/device and subsystem vendor/device ID: - - pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev). - -You can use the constant PCI_ANY_ID as a wildcard replacement for -VENDOR_ID or DEVICE_ID. This allows searching for any device from a -specific vendor, for example. - -These functions are hotplug-safe. They increment the reference count on -the pci_dev that they return. You must eventually (possibly at module unload) -decrement the reference count on these devices by calling pci_dev_put(). - - - -3. Device Initialization Steps -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -As noted in the introduction, most PCI drivers need the following steps -for device initialization: - - Enable the device - Request MMIO/IOP resources - Set the DMA mask size (for both coherent and streaming DMA) - Allocate and initialize shared control data (pci_allocate_coherent()) - Access device configuration space (if needed) - Register IRQ handler (request_irq()) - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) - Enable DMA/processing engines. - -The driver can access PCI config space registers at any time. -(Well, almost. When running BIST, config space can go away...but -that will just result in a PCI Bus Master Abort and config reads -will return garbage). - - -3.1 Enable the PCI device -~~~~~~~~~~~~~~~~~~~~~~~~~ -Before touching any device registers, the driver needs to enable -the PCI device by calling pci_enable_device(). This will: - o wake up the device if it was in suspended state, - o allocate I/O and memory regions of the device (if BIOS did not), - o allocate an IRQ (if BIOS did not). - -NOTE: pci_enable_device() can fail! Check the return value. - -[ OS BUG: we don't check resource allocations before enabling those - resources. The sequence would make more sense if we called - pci_request_resources() before calling pci_enable_device(). - Currently, the device drivers can't detect the bug when when two - devices have been allocated the same range. This is not a common - problem and unlikely to get fixed soon. - - This has been discussed before but not changed as of 2.6.19: - http://lkml.org/lkml/2006/3/2/194 -] - -pci_set_master() will enable DMA by setting the bus master bit -in the PCI_COMMAND register. It also fixes the latency timer value if -it's set to something bogus by the BIOS. pci_clear_master() will -disable DMA by clearing the bus master bit. - -If the PCI device can use the PCI Memory-Write-Invalidate transaction, -call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval -and also ensures that the cache line size register is set correctly. -Check the return value of pci_set_mwi() as not all architectures -or chip-sets may support Memory-Write-Invalidate. Alternatively, -if Mem-Wr-Inval would be nice to have but is not required, call -pci_try_set_mwi() to have the system do its best effort at enabling -Mem-Wr-Inval. - - -3.2 Request MMIO/IOP resources -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Memory (MMIO), and I/O port addresses should NOT be read directly -from the PCI device config space. Use the values in the pci_dev structure -as the PCI "bus address" might have been remapped to a "host physical" -address by the arch/chip-set specific kernel support. - -See Documentation/io-mapping.txt for how to access device registers -or device memory. - -The device driver needs to call pci_request_region() to verify -no other device is already using the same address resource. -Conversely, drivers should call pci_release_region() AFTER -calling pci_disable_device(). -The idea is to prevent two devices colliding on the same address range. - -[ See OS BUG comment above. Currently (2.6.19), The driver can only - determine MMIO and IO Port resource availability _after_ calling - pci_enable_device(). ] - -Generic flavors of pci_request_region() are request_mem_region() -(for MMIO ranges) and request_region() (for IO Port ranges). -Use these for address resources that are not described by "normal" PCI -BARs. - -Also see pci_request_selected_regions() below. - - -3.3 Set the DMA mask size -~~~~~~~~~~~~~~~~~~~~~~~~~ -[ If anything below doesn't make sense, please refer to - Documentation/DMA-API.txt. This section is just a reminder that - drivers need to indicate DMA capabilities of the device and is not - an authoritative source for DMA interfaces. ] - -While all drivers should explicitly indicate the DMA capability -(e.g. 32 or 64 bit) of the PCI bus master, devices with more than -32-bit bus master capability for streaming data need the driver -to "register" this capability by calling pci_set_dma_mask() with -appropriate parameters. In general this allows more efficient DMA -on systems where System RAM exists above 4G _physical_ address. - -Drivers for all PCI-X and PCIe compliant devices must call -pci_set_dma_mask() as they are 64-bit DMA devices. - -Similarly, drivers must also "register" this capability if the device -can directly address "consistent memory" in System RAM above 4G physical -address by calling pci_set_consistent_dma_mask(). -Again, this includes drivers for all PCI-X and PCIe compliant devices. -Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are -64-bit DMA capable for payload ("streaming") data but not control -("consistent") data. - - -3.4 Setup shared control data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared) -memory. See Documentation/DMA-API.txt for a full description of -the DMA APIs. This section is just a reminder that it needs to be done -before enabling DMA on the device. - - -3.5 Initialize device registers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some drivers will need specific "capability" fields programmed -or other "vendor specific" register initialized or reset. -E.g. clearing pending interrupts. - - -3.6 Register IRQ handler -~~~~~~~~~~~~~~~~~~~~~~~~ -While calling request_irq() is the last step described here, -this is often just another intermediate step to initialize a device. -This step can often be deferred until the device is opened for use. - -All interrupt handlers for IRQ lines should be registered with IRQF_SHARED -and use the devid to map IRQs to devices (remember that all PCI IRQ lines -can be shared). - -request_irq() will associate an interrupt handler and device handle -with an interrupt number. Historically interrupt numbers represent -IRQ lines which run from the PCI device to the Interrupt controller. -With MSI and MSI-X (more below) the interrupt number is a CPU "vector". - -request_irq() also enables the interrupt. Make sure the device is -quiesced and does not have any interrupts pending before registering -the interrupt handler. - -MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts" -which deliver interrupts to the CPU via a DMA write to a Local APIC. -The fundamental difference between MSI and MSI-X is how multiple -"vectors" get allocated. MSI requires contiguous blocks of vectors -while MSI-X can allocate several individual ones. - -MSI capability can be enabled by calling pci_alloc_irq_vectors() with the -PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This -causes the PCI support to program CPU vector data into the PCI device -capability registers. Many architectures, chip-sets, or BIOSes do NOT -support MSI or MSI-X and a call to pci_alloc_irq_vectors with just -the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always -specify PCI_IRQ_LEGACY as well. - -Drivers that have different interrupt handlers for MSI/MSI-X and -legacy INTx should chose the right one based on the msi_enabled -and msix_enabled flags in the pci_dev structure after calling -pci_alloc_irq_vectors. - -There are (at least) two really good reasons for using MSI: -1) MSI is an exclusive interrupt vector by definition. - This means the interrupt handler doesn't have to verify - its device caused the interrupt. - -2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed - to be visible to the host CPU(s) when the MSI is delivered. This - is important for both data coherency and avoiding stale control data. - This guarantee allows the driver to omit MMIO reads to flush - the DMA stream. - -See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples -of MSI/MSI-X usage. - - - -4. PCI device shutdown -~~~~~~~~~~~~~~~~~~~~~~~ - -When a PCI device driver is being unloaded, most of the following -steps need to be performed: - - Disable the device from generating IRQs - Release the IRQ (free_irq()) - Stop all DMA activity - Release DMA buffers (both streaming and consistent) - Unregister from other subsystems (e.g. scsi or netdev) - Disable device from responding to MMIO/IO Port addresses - Release MMIO/IO Port resource(s) - - -4.1 Stop IRQs on the device -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -How to do this is chip/device specific. If it's not done, it opens -the possibility of a "screaming interrupt" if (and only if) -the IRQ is shared with another device. - -When the shared IRQ handler is "unhooked", the remaining devices -using the same IRQ line will still need the IRQ enabled. Thus if the -"unhooked" device asserts IRQ line, the system will respond assuming -it was one of the remaining devices asserted the IRQ line. Since none -of the other devices will handle the IRQ, the system will "hang" until -it decides the IRQ isn't going to get handled and masks the IRQ (100,000 -iterations later). Once the shared IRQ is masked, the remaining devices -will stop functioning properly. Not a nice situation. - -This is another reason to use MSI or MSI-X if it's available. -MSI and MSI-X are defined to be exclusive interrupts and thus -are not susceptible to the "screaming interrupt" problem. - - -4.2 Release the IRQ -~~~~~~~~~~~~~~~~~~~ -Once the device is quiesced (no more IRQs), one can call free_irq(). -This function will return control once any pending IRQs are handled, -"unhook" the drivers IRQ handler from that IRQ, and finally release -the IRQ if no one else is using it. - - -4.3 Stop all DMA activity -~~~~~~~~~~~~~~~~~~~~~~~~~ -It's extremely important to stop all DMA operations BEFORE attempting -to deallocate DMA control data. Failure to do so can result in memory -corruption, hangs, and on some chip-sets a hard crash. - -Stopping DMA after stopping the IRQs can avoid races where the -IRQ handler might restart DMA engines. - -While this step sounds obvious and trivial, several "mature" drivers -didn't get this step right in the past. - - -4.4 Release DMA buffers -~~~~~~~~~~~~~~~~~~~~~~~ -Once DMA is stopped, clean up streaming DMA first. -I.e. unmap data buffers and return buffers to "upstream" -owners if there is one. - -Then clean up "consistent" buffers which contain the control data. - -See Documentation/DMA-API.txt for details on unmapping interfaces. - - -4.5 Unregister from other subsystems -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Most low level PCI device drivers support some other subsystem -like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your -driver isn't losing resources from that other subsystem. -If this happens, typically the symptom is an Oops (panic) when -the subsystem attempts to call into a driver that has been unloaded. - - -4.6 Disable Device from responding to MMIO/IO Port addresses -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -io_unmap() MMIO or IO Port resources and then call pci_disable_device(). -This is the symmetric opposite of pci_enable_device(). -Do not access device registers after calling pci_disable_device(). - - -4.7 Release MMIO/IO Port Resource(s) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Call pci_release_region() to mark the MMIO or IO Port range as available. -Failure to do so usually results in the inability to reload the driver. - - - -5. How to access PCI config space -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can use pci_(read|write)_config_(byte|word|dword) to access the config -space of a device represented by struct pci_dev *. All these functions return 0 -when successful or an error code (PCIBIOS_...) which can be translated to a text -string by pcibios_strerror. Most drivers expect that accesses to valid PCI -devices don't fail. - -If you don't have a struct pci_dev available, you can call -pci_bus_(read|write)_config_(byte|word|dword) to access a given device -and function on that bus. - -If you access fields in the standard portion of the config header, please -use symbolic names of locations and bits declared in . - -If you need to access Extended PCI Capability registers, just call -pci_find_capability() for the particular capability and it will find the -corresponding register block for you. - - - -6. Other interesting functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain, - bus and slot and number. If the device is - found, its reference count is increased. -pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3) -pci_find_capability() Find specified capability in device's capability - list. -pci_resource_start() Returns bus start address for a given PCI region -pci_resource_end() Returns bus end address for a given PCI region -pci_resource_len() Returns the byte length of a PCI region -pci_set_drvdata() Set private driver data pointer for a pci_dev -pci_get_drvdata() Return private driver data pointer for a pci_dev -pci_set_mwi() Enable Memory-Write-Invalidate transactions. -pci_clear_mwi() Disable Memory-Write-Invalidate transactions. - - - -7. Miscellaneous hints -~~~~~~~~~~~~~~~~~~~~~~ - -When displaying PCI device names to the user (for example when a driver wants -to tell the user what card has it found), please use pci_name(pci_dev). - -Always refer to the PCI devices by a pointer to the pci_dev structure. -All PCI layer functions use this identification and it's the only -reasonable one. Don't use bus/slot/function numbers except for very -special purposes -- on systems with multiple primary buses their semantics -can be pretty complex. - -Don't try to turn on Fast Back to Back writes in your driver. All devices -on the bus need to be capable of doing it, so this is something which needs -to be handled by platform and generic code, not individual drivers. - - - -8. Vendor and device identifications -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Do not add new device or vendor IDs to include/linux/pci_ids.h unless they -are shared across multiple drivers. You can add private definitions in -your driver if they're helpful, or just use plain hex constants. - -The device IDs are arbitrary hex numbers (vendor controlled) and normally used -only in a single location, the pci_device_id table. - -Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/. -There are mirrors of the pci.ids file at http://pciids.sourceforge.net/ -and https://github.com/pciutils/pciids. - - - -9. Obsolete functions -~~~~~~~~~~~~~~~~~~~~~ - -There are several functions which you might come across when trying to -port an old driver to the new PCI interface. They are no longer present -in the kernel as they aren't compatible with hotplug or PCI domains or -having sane locking. - -pci_find_device() Superseded by pci_get_device() -pci_find_subsys() Superseded by pci_get_subsys() -pci_find_slot() Superseded by pci_get_domain_bus_and_slot() -pci_get_slot() Superseded by pci_get_domain_bus_and_slot() - - -The alternative is the traditional PCI device driver that walks PCI -device lists. This is still possible but discouraged. - - - -10. MMIO Space and "Write Posting" -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Converting a driver from using I/O Port space to using MMIO space -often requires some additional changes. Specifically, "write posting" -needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2) -already do this. I/O Port space guarantees write transactions reach the PCI -device before the CPU can continue. Writes to MMIO space allow the CPU -to continue before the transaction reaches the PCI device. HW weenies -call this "Write Posting" because the write completion is "posted" to -the CPU before the transaction has reached its destination. - -Thus, timing sensitive code should add readl() where the CPU is -expected to wait before doing other work. The classic "bit banging" -sequence works fine for I/O Port space: - - for (i = 8; --i; val >>= 1) { - outb(val & 1, ioport_reg); /* write bit */ - udelay(10); - } - -The same sequence for MMIO space should be: - - for (i = 8; --i; val >>= 1) { - writeb(val & 1, mmio_reg); /* write bit */ - readb(safe_mmio_reg); /* flush posted write */ - udelay(10); - } - -It is important that "safe_mmio_reg" not have any side effects that -interferes with the correct operation of the device. - -Another case to watch out for is when resetting a PCI device. Use PCI -Configuration space reads to flush the writel(). This will gracefully -handle the PCI master abort on all platforms if the PCI device is -expected to not respond to a readl(). Most x86 platforms will allow -MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage -(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail"). - -- cgit From 2e6422444894685a8a3135f7b982aa026dc0f74c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:25 +0800 Subject: Documentation: PCI: convert PCIEBUS-HOWTO.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/PCIEBUS-HOWTO.txt | 198 -------------------------------- Documentation/PCI/index.rst | 1 + Documentation/PCI/picebus-howto.rst | 220 ++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+), 198 deletions(-) delete mode 100644 Documentation/PCI/PCIEBUS-HOWTO.txt create mode 100644 Documentation/PCI/picebus-howto.rst (limited to 'Documentation') diff --git a/Documentation/PCI/PCIEBUS-HOWTO.txt b/Documentation/PCI/PCIEBUS-HOWTO.txt deleted file mode 100644 index 15f0bb3b5045..000000000000 --- a/Documentation/PCI/PCIEBUS-HOWTO.txt +++ /dev/null @@ -1,198 +0,0 @@ - The PCI Express Port Bus Driver Guide HOWTO - Tom L Nguyen tom.l.nguyen@intel.com - 11/03/2004 - -1. About this guide - -This guide describes the basics of the PCI Express Port Bus driver -and provides information on how to enable the service drivers to -register/unregister with the PCI Express Port Bus Driver. - -2. Copyright 2004 Intel Corporation - -3. What is the PCI Express Port Bus Driver - -A PCI Express Port is a logical PCI-PCI Bridge structure. There -are two types of PCI Express Port: the Root Port and the Switch -Port. The Root Port originates a PCI Express link from a PCI Express -Root Complex and the Switch Port connects PCI Express links to -internal logical PCI buses. The Switch Port, which has its secondary -bus representing the switch's internal routing logic, is called the -switch's Upstream Port. The switch's Downstream Port is bridging from -switch's internal routing bus to a bus representing the downstream -PCI Express link from the PCI Express Switch. - -A PCI Express Port can provide up to four distinct functions, -referred to in this document as services, depending on its port type. -PCI Express Port's services include native hotplug support (HP), -power management event support (PME), advanced error reporting -support (AER), and virtual channel support (VC). These services may -be handled by a single complex driver or be individually distributed -and handled by corresponding service drivers. - -4. Why use the PCI Express Port Bus Driver? - -In existing Linux kernels, the Linux Device Driver Model allows a -physical device to be handled by only a single driver. The PCI -Express Port is a PCI-PCI Bridge device with multiple distinct -services. To maintain a clean and simple solution each service -may have its own software service driver. In this case several -service drivers will compete for a single PCI-PCI Bridge device. -For example, if the PCI Express Root Port native hotplug service -driver is loaded first, it claims a PCI-PCI Bridge Root Port. The -kernel therefore does not load other service drivers for that Root -Port. In other words, it is impossible to have multiple service -drivers load and run on a PCI-PCI Bridge device simultaneously -using the current driver model. - -To enable multiple service drivers running simultaneously requires -having a PCI Express Port Bus driver, which manages all populated -PCI Express Ports and distributes all provided service requests -to the corresponding service drivers as required. Some key -advantages of using the PCI Express Port Bus driver are listed below: - - - Allow multiple service drivers to run simultaneously on - a PCI-PCI Bridge Port device. - - - Allow service drivers implemented in an independent - staged approach. - - - Allow one service driver to run on multiple PCI-PCI Bridge - Port devices. - - - Manage and distribute resources of a PCI-PCI Bridge Port - device to requested service drivers. - -5. Configuring the PCI Express Port Bus Driver vs. Service Drivers - -5.1 Including the PCI Express Port Bus Driver Support into the Kernel - -Including the PCI Express Port Bus driver depends on whether the PCI -Express support is included in the kernel config. The kernel will -automatically include the PCI Express Port Bus driver as a kernel -driver when the PCI Express support is enabled in the kernel. - -5.2 Enabling Service Driver Support - -PCI device drivers are implemented based on Linux Device Driver Model. -All service drivers are PCI device drivers. As discussed above, it is -impossible to load any service driver once the kernel has loaded the -PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver -Model requires some minimal changes on existing service drivers that -imposes no impact on the functionality of existing service drivers. - -A service driver is required to use the two APIs shown below to -register its service with the PCI Express Port Bus driver (see -section 5.2.1 & 5.2.2). It is important that a service driver -initializes the pcie_port_service_driver data structure, included in -header file /include/linux/pcieport_if.h, before calling these APIs. -Failure to do so will result an identity mismatch, which prevents -the PCI Express Port Bus driver from loading a service driver. - -5.2.1 pcie_port_service_register - -int pcie_port_service_register(struct pcie_port_service_driver *new) - -This API replaces the Linux Driver Model's pci_register_driver API. A -service driver should always calls pcie_port_service_register at -module init. Note that after service driver being loaded, calls -such as pci_enable_device(dev) and pci_set_master(dev) are no longer -necessary since these calls are executed by the PCI Port Bus driver. - -5.2.2 pcie_port_service_unregister - -void pcie_port_service_unregister(struct pcie_port_service_driver *new) - -pcie_port_service_unregister replaces the Linux Driver Model's -pci_unregister_driver. It's always called by service driver when a -module exits. - -5.2.3 Sample Code - -Below is sample service driver code to initialize the port service -driver data structure. - -static struct pcie_port_service_id service_id[] = { { - .vendor = PCI_ANY_ID, - .device = PCI_ANY_ID, - .port_type = PCIE_RC_PORT, - .service_type = PCIE_PORT_SERVICE_AER, - }, { /* end: all zeroes */ } -}; - -static struct pcie_port_service_driver root_aerdrv = { - .name = (char *)device_name, - .id_table = &service_id[0], - - .probe = aerdrv_load, - .remove = aerdrv_unload, - - .suspend = aerdrv_suspend, - .resume = aerdrv_resume, -}; - -Below is a sample code for registering/unregistering a service -driver. - -static int __init aerdrv_service_init(void) -{ - int retval = 0; - - retval = pcie_port_service_register(&root_aerdrv); - if (!retval) { - /* - * FIX ME - */ - } - return retval; -} - -static void __exit aerdrv_service_exit(void) -{ - pcie_port_service_unregister(&root_aerdrv); -} - -module_init(aerdrv_service_init); -module_exit(aerdrv_service_exit); - -6. Possible Resource Conflicts - -Since all service drivers of a PCI-PCI Bridge Port device are -allowed to run simultaneously, below lists a few of possible resource -conflicts with proposed solutions. - -6.1 MSI and MSI-X Vector Resource - -Once MSI or MSI-X interrupts are enabled on a device, it stays in this -mode until they are disabled again. Since service drivers of the same -PCI-PCI Bridge port share the same physical device, if an individual -service driver enables or disables MSI/MSI-X mode it may result -unpredictable behavior. - -To avoid this situation all service drivers are not permitted to -switch interrupt mode on its device. The PCI Express Port Bus driver -is responsible for determining the interrupt mode and this should be -transparent to service drivers. Service drivers need to know only -the vector IRQ assigned to the field irq of struct pcie_device, which -is passed in when the PCI Express Port Bus driver probes each service -driver. Service drivers should use (struct pcie_device*)dev->irq to -call request_irq/free_irq. In addition, the interrupt mode is stored -in the field interrupt_mode of struct pcie_device. - -6.3 PCI Memory/IO Mapped Regions - -Service drivers for PCI Express Power Management (PME), Advanced -Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access -PCI configuration space on the PCI Express port. In all cases the -registers accessed are independent of each other. This patch assumes -that all service drivers will be well behaved and not overwrite -other service driver's configuration settings. - -6.4 PCI Config Registers - -Each service driver runs its PCI config operations on its own -capability structure except the PCI Express capability structure, in -which Root Control register and Device Control register are shared -between PME and AER. This patch assumes that all service drivers -will be well behaved and not overwrite other service driver's -configuration settings. diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 7babf43709b0..79d6d75bbf28 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -9,3 +9,4 @@ Linux PCI Bus Subsystem :numbered: pci + picebus-howto diff --git a/Documentation/PCI/picebus-howto.rst b/Documentation/PCI/picebus-howto.rst new file mode 100644 index 000000000000..f882ff62c51f --- /dev/null +++ b/Documentation/PCI/picebus-howto.rst @@ -0,0 +1,220 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================== +The PCI Express Port Bus Driver Guide HOWTO +=========================================== + +:Author: Tom L Nguyen tom.l.nguyen@intel.com 11/03/2004 +:Copyright: |copy| 2004 Intel Corporation + +About this guide +================ + +This guide describes the basics of the PCI Express Port Bus driver +and provides information on how to enable the service drivers to +register/unregister with the PCI Express Port Bus Driver. + + +What is the PCI Express Port Bus Driver +======================================= + +A PCI Express Port is a logical PCI-PCI Bridge structure. There +are two types of PCI Express Port: the Root Port and the Switch +Port. The Root Port originates a PCI Express link from a PCI Express +Root Complex and the Switch Port connects PCI Express links to +internal logical PCI buses. The Switch Port, which has its secondary +bus representing the switch's internal routing logic, is called the +switch's Upstream Port. The switch's Downstream Port is bridging from +switch's internal routing bus to a bus representing the downstream +PCI Express link from the PCI Express Switch. + +A PCI Express Port can provide up to four distinct functions, +referred to in this document as services, depending on its port type. +PCI Express Port's services include native hotplug support (HP), +power management event support (PME), advanced error reporting +support (AER), and virtual channel support (VC). These services may +be handled by a single complex driver or be individually distributed +and handled by corresponding service drivers. + +Why use the PCI Express Port Bus Driver? +======================================== + +In existing Linux kernels, the Linux Device Driver Model allows a +physical device to be handled by only a single driver. The PCI +Express Port is a PCI-PCI Bridge device with multiple distinct +services. To maintain a clean and simple solution each service +may have its own software service driver. In this case several +service drivers will compete for a single PCI-PCI Bridge device. +For example, if the PCI Express Root Port native hotplug service +driver is loaded first, it claims a PCI-PCI Bridge Root Port. The +kernel therefore does not load other service drivers for that Root +Port. In other words, it is impossible to have multiple service +drivers load and run on a PCI-PCI Bridge device simultaneously +using the current driver model. + +To enable multiple service drivers running simultaneously requires +having a PCI Express Port Bus driver, which manages all populated +PCI Express Ports and distributes all provided service requests +to the corresponding service drivers as required. Some key +advantages of using the PCI Express Port Bus driver are listed below: + + - Allow multiple service drivers to run simultaneously on + a PCI-PCI Bridge Port device. + + - Allow service drivers implemented in an independent + staged approach. + + - Allow one service driver to run on multiple PCI-PCI Bridge + Port devices. + + - Manage and distribute resources of a PCI-PCI Bridge Port + device to requested service drivers. + +Configuring the PCI Express Port Bus Driver vs. Service Drivers +=============================================================== + +Including the PCI Express Port Bus Driver Support into the Kernel +----------------------------------------------------------------- + +Including the PCI Express Port Bus driver depends on whether the PCI +Express support is included in the kernel config. The kernel will +automatically include the PCI Express Port Bus driver as a kernel +driver when the PCI Express support is enabled in the kernel. + +Enabling Service Driver Support +------------------------------- + +PCI device drivers are implemented based on Linux Device Driver Model. +All service drivers are PCI device drivers. As discussed above, it is +impossible to load any service driver once the kernel has loaded the +PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver +Model requires some minimal changes on existing service drivers that +imposes no impact on the functionality of existing service drivers. + +A service driver is required to use the two APIs shown below to +register its service with the PCI Express Port Bus driver (see +section 5.2.1 & 5.2.2). It is important that a service driver +initializes the pcie_port_service_driver data structure, included in +header file /include/linux/pcieport_if.h, before calling these APIs. +Failure to do so will result an identity mismatch, which prevents +the PCI Express Port Bus driver from loading a service driver. + +pcie_port_service_register +~~~~~~~~~~~~~~~~~~~~~~~~~~ +:: + + int pcie_port_service_register(struct pcie_port_service_driver *new) + +This API replaces the Linux Driver Model's pci_register_driver API. A +service driver should always calls pcie_port_service_register at +module init. Note that after service driver being loaded, calls +such as pci_enable_device(dev) and pci_set_master(dev) are no longer +necessary since these calls are executed by the PCI Port Bus driver. + +pcie_port_service_unregister +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:: + + void pcie_port_service_unregister(struct pcie_port_service_driver *new) + +pcie_port_service_unregister replaces the Linux Driver Model's +pci_unregister_driver. It's always called by service driver when a +module exits. + +Sample Code +~~~~~~~~~~~ + +Below is sample service driver code to initialize the port service +driver data structure. +:: + + static struct pcie_port_service_id service_id[] = { { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .port_type = PCIE_RC_PORT, + .service_type = PCIE_PORT_SERVICE_AER, + }, { /* end: all zeroes */ } + }; + + static struct pcie_port_service_driver root_aerdrv = { + .name = (char *)device_name, + .id_table = &service_id[0], + + .probe = aerdrv_load, + .remove = aerdrv_unload, + + .suspend = aerdrv_suspend, + .resume = aerdrv_resume, + }; + +Below is a sample code for registering/unregistering a service +driver. +:: + + static int __init aerdrv_service_init(void) + { + int retval = 0; + + retval = pcie_port_service_register(&root_aerdrv); + if (!retval) { + /* + * FIX ME + */ + } + return retval; + } + + static void __exit aerdrv_service_exit(void) + { + pcie_port_service_unregister(&root_aerdrv); + } + + module_init(aerdrv_service_init); + module_exit(aerdrv_service_exit); + +Possible Resource Conflicts +=========================== + +Since all service drivers of a PCI-PCI Bridge Port device are +allowed to run simultaneously, below lists a few of possible resource +conflicts with proposed solutions. + +MSI and MSI-X Vector Resource +----------------------------- + +Once MSI or MSI-X interrupts are enabled on a device, it stays in this +mode until they are disabled again. Since service drivers of the same +PCI-PCI Bridge port share the same physical device, if an individual +service driver enables or disables MSI/MSI-X mode it may result +unpredictable behavior. + +To avoid this situation all service drivers are not permitted to +switch interrupt mode on its device. The PCI Express Port Bus driver +is responsible for determining the interrupt mode and this should be +transparent to service drivers. Service drivers need to know only +the vector IRQ assigned to the field irq of struct pcie_device, which +is passed in when the PCI Express Port Bus driver probes each service +driver. Service drivers should use (struct pcie_device*)dev->irq to +call request_irq/free_irq. In addition, the interrupt mode is stored +in the field interrupt_mode of struct pcie_device. + +PCI Memory/IO Mapped Regions +---------------------------- + +Service drivers for PCI Express Power Management (PME), Advanced +Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access +PCI configuration space on the PCI Express port. In all cases the +registers accessed are independent of each other. This patch assumes +that all service drivers will be well behaved and not overwrite +other service driver's configuration settings. + +PCI Config Registers +-------------------- + +Each service driver runs its PCI config operations on its own +capability structure except the PCI Express capability structure, in +which Root Control register and Device Control register are shared +between PME and AER. This patch assumes that all service drivers +will be well behaved and not overwrite other service driver's +configuration settings. -- cgit From 4d2c729c62328d6841111d98396374476367ae83 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:26 +0800 Subject: Documentation: PCI: convert pci-iov-howto.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 1 + Documentation/PCI/pci-iov-howto.rst | 172 ++++++++++++++++++++++++++++++++++++ Documentation/PCI/pci-iov-howto.txt | 147 ------------------------------ 3 files changed, 173 insertions(+), 147 deletions(-) create mode 100644 Documentation/PCI/pci-iov-howto.rst delete mode 100644 Documentation/PCI/pci-iov-howto.txt (limited to 'Documentation') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 79d6d75bbf28..0d9390298c4a 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -10,3 +10,4 @@ Linux PCI Bus Subsystem pci picebus-howto + pci-iov-howto diff --git a/Documentation/PCI/pci-iov-howto.rst b/Documentation/PCI/pci-iov-howto.rst new file mode 100644 index 000000000000..b9fd003206f1 --- /dev/null +++ b/Documentation/PCI/pci-iov-howto.rst @@ -0,0 +1,172 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +==================================== +PCI Express I/O Virtualization Howto +==================================== + +:Copyright: |copy| 2009 Intel Corporation +:Authors: - Yu Zhao + - Donald Dutile + +Overview +======== + +What is SR-IOV +-------------- + +Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended +capability which makes one physical device appear as multiple virtual +devices. The physical device is referred to as Physical Function (PF) +while the virtual devices are referred to as Virtual Functions (VF). +Allocation of the VF can be dynamically controlled by the PF via +registers encapsulated in the capability. By default, this feature is +not enabled and the PF behaves as traditional PCIe device. Once it's +turned on, each VF's PCI configuration space can be accessed by its own +Bus, Device and Function Number (Routing ID). And each VF also has PCI +Memory Space, which is used to map its register set. VF device driver +operates on the register set so it can be functional and appear as a +real existing PCI device. + +User Guide +========== + +How can I enable SR-IOV capability +---------------------------------- + +Multiple methods are available for SR-IOV enablement. +In the first method, the device driver (PF driver) will control the +enabling and disabling of the capability via API provided by SR-IOV core. +If the hardware has SR-IOV capability, loading its PF driver would +enable it and all VFs associated with the PF. Some PF drivers require +a module parameter to be set to determine the number of VFs to enable. +In the second method, a write to the sysfs file sriov_numvfs will +enable and disable the VFs associated with a PCIe PF. This method +enables per-PF, VF enable/disable values versus the first method, +which applies to all PFs of the same device. Additionally, the +PCI SRIOV core support ensures that enable/disable operations are +valid to reduce duplication in multiple drivers for the same +checks, e.g., check numvfs == 0 if enabling VFs, ensure +numvfs <= totalvfs. +The second method is the recommended method for new/future VF devices. + +How can I use the Virtual Functions +----------------------------------- + +The VF is treated as hot-plugged PCI devices in the kernel, so they +should be able to work in the same way as real PCI devices. The VF +requires device driver that is same as a normal PCI device's. + +Developer Guide +=============== + +SR-IOV API +---------- + +To enable SR-IOV capability: + +(a) For the first method, in the driver:: + + int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); + +'nr_virtfn' is number of VFs to be enabled. + +(b) For the second method, from sysfs:: + + echo 'nr_virtfn' > \ + /sys/bus/pci/devices//sriov_numvfs + +To disable SR-IOV capability: + +(a) For the first method, in the driver:: + + void pci_disable_sriov(struct pci_dev *dev); + +(b) For the second method, from sysfs:: + + echo 0 > \ + /sys/bus/pci/devices//sriov_numvfs + +To enable auto probing VFs by a compatible driver on the host, run +command below before enabling SR-IOV capabilities. This is the +default behavior. +:: + + echo 1 > \ + /sys/bus/pci/devices//sriov_drivers_autoprobe + +To disable auto probing VFs by a compatible driver on the host, run +command below before enabling SR-IOV capabilities. Updating this +entry will not affect VFs which are already probed. +:: + + echo 0 > \ + /sys/bus/pci/devices//sriov_drivers_autoprobe + +Usage example +------------- + +Following piece of code illustrates the usage of the SR-IOV API. +:: + + static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id) + { + pci_enable_sriov(dev, NR_VIRTFN); + + ... + + return 0; + } + + static void dev_remove(struct pci_dev *dev) + { + pci_disable_sriov(dev); + + ... + } + + static int dev_suspend(struct pci_dev *dev, pm_message_t state) + { + ... + + return 0; + } + + static int dev_resume(struct pci_dev *dev) + { + ... + + return 0; + } + + static void dev_shutdown(struct pci_dev *dev) + { + ... + } + + static int dev_sriov_configure(struct pci_dev *dev, int numvfs) + { + if (numvfs > 0) { + ... + pci_enable_sriov(dev, numvfs); + ... + return numvfs; + } + if (numvfs == 0) { + .... + pci_disable_sriov(dev); + ... + return 0; + } + } + + static struct pci_driver dev_driver = { + .name = "SR-IOV Physical Function driver", + .id_table = dev_id_table, + .probe = dev_probe, + .remove = dev_remove, + .suspend = dev_suspend, + .resume = dev_resume, + .shutdown = dev_shutdown, + .sriov_configure = dev_sriov_configure, + }; diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt deleted file mode 100644 index d2a84151e99c..000000000000 --- a/Documentation/PCI/pci-iov-howto.txt +++ /dev/null @@ -1,147 +0,0 @@ - PCI Express I/O Virtualization Howto - Copyright (C) 2009 Intel Corporation - Yu Zhao - - Update: November 2012 - -- sysfs-based SRIOV enable-/disable-ment - Donald Dutile - -1. Overview - -1.1 What is SR-IOV - -Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended -capability which makes one physical device appear as multiple virtual -devices. The physical device is referred to as Physical Function (PF) -while the virtual devices are referred to as Virtual Functions (VF). -Allocation of the VF can be dynamically controlled by the PF via -registers encapsulated in the capability. By default, this feature is -not enabled and the PF behaves as traditional PCIe device. Once it's -turned on, each VF's PCI configuration space can be accessed by its own -Bus, Device and Function Number (Routing ID). And each VF also has PCI -Memory Space, which is used to map its register set. VF device driver -operates on the register set so it can be functional and appear as a -real existing PCI device. - -2. User Guide - -2.1 How can I enable SR-IOV capability - -Multiple methods are available for SR-IOV enablement. -In the first method, the device driver (PF driver) will control the -enabling and disabling of the capability via API provided by SR-IOV core. -If the hardware has SR-IOV capability, loading its PF driver would -enable it and all VFs associated with the PF. Some PF drivers require -a module parameter to be set to determine the number of VFs to enable. -In the second method, a write to the sysfs file sriov_numvfs will -enable and disable the VFs associated with a PCIe PF. This method -enables per-PF, VF enable/disable values versus the first method, -which applies to all PFs of the same device. Additionally, the -PCI SRIOV core support ensures that enable/disable operations are -valid to reduce duplication in multiple drivers for the same -checks, e.g., check numvfs == 0 if enabling VFs, ensure -numvfs <= totalvfs. -The second method is the recommended method for new/future VF devices. - -2.2 How can I use the Virtual Functions - -The VF is treated as hot-plugged PCI devices in the kernel, so they -should be able to work in the same way as real PCI devices. The VF -requires device driver that is same as a normal PCI device's. - -3. Developer Guide - -3.1 SR-IOV API - -To enable SR-IOV capability: -(a) For the first method, in the driver: - int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); - 'nr_virtfn' is number of VFs to be enabled. -(b) For the second method, from sysfs: - echo 'nr_virtfn' > \ - /sys/bus/pci/devices//sriov_numvfs - -To disable SR-IOV capability: -(a) For the first method, in the driver: - void pci_disable_sriov(struct pci_dev *dev); -(b) For the second method, from sysfs: - echo 0 > \ - /sys/bus/pci/devices//sriov_numvfs - -To enable auto probing VFs by a compatible driver on the host, run -command below before enabling SR-IOV capabilities. This is the -default behavior. - echo 1 > \ - /sys/bus/pci/devices//sriov_drivers_autoprobe - -To disable auto probing VFs by a compatible driver on the host, run -command below before enabling SR-IOV capabilities. Updating this -entry will not affect VFs which are already probed. - echo 0 > \ - /sys/bus/pci/devices//sriov_drivers_autoprobe - -3.2 Usage example - -Following piece of code illustrates the usage of the SR-IOV API. - -static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - pci_enable_sriov(dev, NR_VIRTFN); - - ... - - return 0; -} - -static void dev_remove(struct pci_dev *dev) -{ - pci_disable_sriov(dev); - - ... -} - -static int dev_suspend(struct pci_dev *dev, pm_message_t state) -{ - ... - - return 0; -} - -static int dev_resume(struct pci_dev *dev) -{ - ... - - return 0; -} - -static void dev_shutdown(struct pci_dev *dev) -{ - ... -} - -static int dev_sriov_configure(struct pci_dev *dev, int numvfs) -{ - if (numvfs > 0) { - ... - pci_enable_sriov(dev, numvfs); - ... - return numvfs; - } - if (numvfs == 0) { - .... - pci_disable_sriov(dev); - ... - return 0; - } -} - -static struct pci_driver dev_driver = { - .name = "SR-IOV Physical Function driver", - .id_table = dev_id_table, - .probe = dev_probe, - .remove = dev_remove, - .suspend = dev_suspend, - .resume = dev_resume, - .shutdown = dev_shutdown, - .sriov_configure = dev_sriov_configure, -}; -- cgit From 3b9bae029b60ee0fa6d6205e0debfad4482434a7 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:27 +0800 Subject: Documentation: PCI: convert MSI-HOWTO.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/MSI-HOWTO.txt | 270 ------------------------------------- Documentation/PCI/index.rst | 1 + Documentation/PCI/msi-howto.rst | 287 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 288 insertions(+), 270 deletions(-) delete mode 100644 Documentation/PCI/MSI-HOWTO.txt create mode 100644 Documentation/PCI/msi-howto.rst (limited to 'Documentation') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt deleted file mode 100644 index 618e13d5e276..000000000000 --- a/Documentation/PCI/MSI-HOWTO.txt +++ /dev/null @@ -1,270 +0,0 @@ - The MSI Driver Guide HOWTO - Tom L Nguyen tom.l.nguyen@intel.com - 10/03/2003 - Revised Feb 12, 2004 by Martine Silbermann - email: Martine.Silbermann@hp.com - Revised Jun 25, 2004 by Tom L Nguyen - Revised Jul 9, 2008 by Matthew Wilcox - Copyright 2003, 2008 Intel Corporation - -1. About this guide - -This guide describes the basics of Message Signaled Interrupts (MSIs), -the advantages of using MSI over traditional interrupt mechanisms, how -to change your driver to use MSI or MSI-X and some basic diagnostics to -try if a device doesn't support MSIs. - - -2. What are MSIs? - -A Message Signaled Interrupt is a write from the device to a special -address which causes an interrupt to be received by the CPU. - -The MSI capability was first specified in PCI 2.2 and was later enhanced -in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X -capability was also introduced with PCI 3.0. It supports more interrupts -per device than MSI and allows interrupts to be independently configured. - -Devices may support both MSI and MSI-X, but only one can be enabled at -a time. - - -3. Why use MSIs? - -There are three reasons why using MSIs can give an advantage over -traditional pin-based interrupts. - -Pin-based PCI interrupts are often shared amongst several devices. -To support this, the kernel must call each interrupt handler associated -with an interrupt, which leads to reduced performance for the system as -a whole. MSIs are never shared, so this problem cannot arise. - -When a device writes data to memory, then raises a pin-based interrupt, -it is possible that the interrupt may arrive before all the data has -arrived in memory (this becomes more likely with devices behind PCI-PCI -bridges). In order to ensure that all the data has arrived in memory, -the interrupt handler must read a register on the device which raised -the interrupt. PCI transaction ordering rules require that all the data -arrive in memory before the value may be returned from the register. -Using MSIs avoids this problem as the interrupt-generating write cannot -pass the data writes, so by the time the interrupt is raised, the driver -knows that all the data has arrived in memory. - -PCI devices can only support a single pin-based interrupt per function. -Often drivers have to query the device to find out what event has -occurred, slowing down interrupt handling for the common case. With -MSIs, a device can support more interrupts, allowing each interrupt -to be specialised to a different purpose. One possible design gives -infrequent conditions (such as errors) their own interrupt which allows -the driver to handle the normal interrupt handling path more efficiently. -Other possible designs include giving one interrupt to each packet queue -in a network card or each port in a storage controller. - - -4. How to use MSIs - -PCI devices are initialised to use pin-based interrupts. The device -driver has to set up the device to use MSI or MSI-X. Not all machines -support MSIs correctly, and for those machines, the APIs described below -will simply fail and the device will continue to use pin-based interrupts. - -4.1 Include kernel support for MSIs - -To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI -option enabled. This option is only available on some architectures, -and it may depend on some other options also being set. For example, -on x86, you must also enable X86_UP_APIC or SMP in order to see the -CONFIG_PCI_MSI option. - -4.2 Using MSI - -Most of the hard work is done for the driver in the PCI layer. The driver -simply has to request that the PCI layer set up the MSI capability for this -device. - -To automatically use MSI or MSI-X interrupt vectors, use the following -function: - - int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, - unsigned int max_vecs, unsigned int flags); - -which allocates up to max_vecs interrupt vectors for a PCI device. It -returns the number of vectors allocated or a negative error. If the device -has a requirements for a minimum number of vectors the driver can pass a -min_vecs argument set to this limit, and the PCI core will return -ENOSPC -if it can't meet the minimum number of vectors. - -The flags argument is used to specify which type of interrupt can be used -by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX). -A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for -any possible kind of interrupt. If the PCI_IRQ_AFFINITY flag is set, -pci_alloc_irq_vectors() will spread the interrupts around the available CPUs. - -To get the Linux IRQ numbers passed to request_irq() and free_irq() and the -vectors, use the following function: - - int pci_irq_vector(struct pci_dev *dev, unsigned int nr); - -Any allocated resources should be freed before removing the device using -the following function: - - void pci_free_irq_vectors(struct pci_dev *dev); - -If a device supports both MSI-X and MSI capabilities, this API will use the -MSI-X facilities in preference to the MSI facilities. MSI-X supports any -number of interrupts between 1 and 2048. In contrast, MSI is restricted to -a maximum of 32 interrupts (and must be a power of two). In addition, the -MSI interrupt vectors must be allocated consecutively, so the system might -not be able to allocate as many vectors for MSI as it could for MSI-X. On -some platforms, MSI interrupts must all be targeted at the same set of CPUs -whereas MSI-X interrupts can all be targeted at different CPUs. - -If a device supports neither MSI-X or MSI it will fall back to a single -legacy IRQ vector. - -The typical usage of MSI or MSI-X interrupts is to allocate as many vectors -as possible, likely up to the limit supported by the device. If nvec is -larger than the number supported by the device it will automatically be -capped to the supported limit, so there is no need to query the number of -vectors supported beforehand: - - nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES) - if (nvec < 0) - goto out_err; - -If a driver is unable or unwilling to deal with a variable number of MSI -interrupts it can request a particular number of interrupts by passing that -number to pci_alloc_irq_vectors() function as both 'min_vecs' and -'max_vecs' parameters: - - ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES); - if (ret < 0) - goto out_err; - -The most notorious example of the request type described above is enabling -the single MSI mode for a device. It could be done by passing two 1s as -'min_vecs' and 'max_vecs': - - ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); - if (ret < 0) - goto out_err; - -Some devices might not support using legacy line interrupts, in which case -the driver can specify that only MSI or MSI-X is acceptable: - - nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX); - if (nvec < 0) - goto out_err; - -4.3 Legacy APIs - -The following old APIs to enable and disable MSI or MSI-X interrupts should -not be used in new code: - - pci_enable_msi() /* deprecated */ - pci_disable_msi() /* deprecated */ - pci_enable_msix_range() /* deprecated */ - pci_enable_msix_exact() /* deprecated */ - pci_disable_msix() /* deprecated */ - -Additionally there are APIs to provide the number of supported MSI or MSI-X -vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these -should be avoided in favor of letting pci_alloc_irq_vectors() cap the -number of vectors. If you have a legitimate special use case for the count -of vectors we might have to revisit that decision and add a -pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. - -4.4 Considerations when using MSIs - -4.4.1 Spinlocks - -Most device drivers have a per-device spinlock which is taken in the -interrupt handler. With pin-based interrupts or a single MSI, it is not -necessary to disable interrupts (Linux guarantees the same interrupt will -not be re-entered). If a device uses multiple interrupts, the driver -must disable interrupts while the lock is held. If the device sends -a different interrupt, the driver will deadlock trying to recursively -acquire the spinlock. Such deadlocks can be avoided by using -spin_lock_irqsave() or spin_lock_irq() which disable local interrupts -and acquire the lock (see Documentation/kernel-hacking/locking.rst). - -4.5 How to tell whether MSI/MSI-X is enabled on a device - -Using 'lspci -v' (as root) may show some devices with "MSI", "Message -Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities -has an 'Enable' flag which is followed with either "+" (enabled) -or "-" (disabled). - - -5. MSI quirks - -Several PCI chipsets or devices are known not to support MSIs. -The PCI stack provides three ways to disable MSIs: - -1. globally -2. on all devices behind a specific bridge -3. on a single device - -5.1. Disabling MSIs globally - -Some host chipsets simply don't support MSIs properly. If we're -lucky, the manufacturer knows this and has indicated it in the ACPI -FADT table. In this case, Linux automatically disables MSIs. -Some boards don't include this information in the table and so we have -to detect them ourselves. The complete list of these is found near the -quirk_disable_all_msi() function in drivers/pci/quirks.c. - -If you have a board which has problems with MSIs, you can pass pci=nomsi -on the kernel command line to disable MSIs on all devices. It would be -in your best interests to report the problem to linux-pci@vger.kernel.org -including a full 'lspci -v' so we can add the quirks to the kernel. - -5.2. Disabling MSIs below a bridge - -Some PCI bridges are not able to route MSIs between busses properly. -In this case, MSIs must be disabled on all devices behind the bridge. - -Some bridges allow you to enable MSIs by changing some bits in their -PCI configuration space (especially the Hypertransport chipsets such -as the nVidia nForce and Serverworks HT2000). As with host chipsets, -Linux mostly knows about them and automatically enables MSIs if it can. -If you have a bridge unknown to Linux, you can enable -MSIs in configuration space using whatever method you know works, then -enable MSIs on that bridge by doing: - - echo 1 > /sys/bus/pci/devices/$bridge/msi_bus - -where $bridge is the PCI address of the bridge you've enabled (eg -0000:00:0e.0). - -To disable MSIs, echo 0 instead of 1. Changing this value should be -done with caution as it could break interrupt handling for all devices -below this bridge. - -Again, please notify linux-pci@vger.kernel.org of any bridges that need -special handling. - -5.3. Disabling MSIs on a single device - -Some devices are known to have faulty MSI implementations. Usually this -is handled in the individual device driver, but occasionally it's necessary -to handle this with a quirk. Some drivers have an option to disable use -of MSI. While this is a convenient workaround for the driver author, -it is not good practice, and should not be emulated. - -5.4. Finding why MSIs are disabled on a device - -From the above three sections, you can see that there are many reasons -why MSIs may not be enabled for a given device. Your first step should -be to examine your dmesg carefully to determine whether MSIs are enabled -for your machine. You should also check your .config to be sure you -have enabled CONFIG_PCI_MSI. - -Then, 'lspci -t' gives the list of bridges above a device. Reading -/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1) -or disabled (0). If 0 is found in any of the msi_bus files belonging -to bridges between the PCI root and the device, MSIs are disabled. - -It is also worth checking the device driver to see whether it supports MSIs. -For example, it may contain calls to pci_irq_alloc_vectors() with the -PCI_IRQ_MSI or PCI_IRQ_MSIX flags. diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 0d9390298c4a..458354daac47 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -11,3 +11,4 @@ Linux PCI Bus Subsystem pci picebus-howto pci-iov-howto + msi-howto diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst new file mode 100644 index 000000000000..994cbb660ade --- /dev/null +++ b/Documentation/PCI/msi-howto.rst @@ -0,0 +1,287 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +========================== +The MSI Driver Guide HOWTO +========================== + +:Authors: Tom L Nguyen; Martine Silbermann; Matthew Wilcox + +:Copyright: 2003, 2008 Intel Corporation + +About this guide +================ + +This guide describes the basics of Message Signaled Interrupts (MSIs), +the advantages of using MSI over traditional interrupt mechanisms, how +to change your driver to use MSI or MSI-X and some basic diagnostics to +try if a device doesn't support MSIs. + + +What are MSIs? +============== + +A Message Signaled Interrupt is a write from the device to a special +address which causes an interrupt to be received by the CPU. + +The MSI capability was first specified in PCI 2.2 and was later enhanced +in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X +capability was also introduced with PCI 3.0. It supports more interrupts +per device than MSI and allows interrupts to be independently configured. + +Devices may support both MSI and MSI-X, but only one can be enabled at +a time. + + +Why use MSIs? +============= + +There are three reasons why using MSIs can give an advantage over +traditional pin-based interrupts. + +Pin-based PCI interrupts are often shared amongst several devices. +To support this, the kernel must call each interrupt handler associated +with an interrupt, which leads to reduced performance for the system as +a whole. MSIs are never shared, so this problem cannot arise. + +When a device writes data to memory, then raises a pin-based interrupt, +it is possible that the interrupt may arrive before all the data has +arrived in memory (this becomes more likely with devices behind PCI-PCI +bridges). In order to ensure that all the data has arrived in memory, +the interrupt handler must read a register on the device which raised +the interrupt. PCI transaction ordering rules require that all the data +arrive in memory before the value may be returned from the register. +Using MSIs avoids this problem as the interrupt-generating write cannot +pass the data writes, so by the time the interrupt is raised, the driver +knows that all the data has arrived in memory. + +PCI devices can only support a single pin-based interrupt per function. +Often drivers have to query the device to find out what event has +occurred, slowing down interrupt handling for the common case. With +MSIs, a device can support more interrupts, allowing each interrupt +to be specialised to a different purpose. One possible design gives +infrequent conditions (such as errors) their own interrupt which allows +the driver to handle the normal interrupt handling path more efficiently. +Other possible designs include giving one interrupt to each packet queue +in a network card or each port in a storage controller. + + +How to use MSIs +=============== + +PCI devices are initialised to use pin-based interrupts. The device +driver has to set up the device to use MSI or MSI-X. Not all machines +support MSIs correctly, and for those machines, the APIs described below +will simply fail and the device will continue to use pin-based interrupts. + +Include kernel support for MSIs +------------------------------- + +To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI +option enabled. This option is only available on some architectures, +and it may depend on some other options also being set. For example, +on x86, you must also enable X86_UP_APIC or SMP in order to see the +CONFIG_PCI_MSI option. + +Using MSI +--------- + +Most of the hard work is done for the driver in the PCI layer. The driver +simply has to request that the PCI layer set up the MSI capability for this +device. + +To automatically use MSI or MSI-X interrupt vectors, use the following +function:: + + int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); + +which allocates up to max_vecs interrupt vectors for a PCI device. It +returns the number of vectors allocated or a negative error. If the device +has a requirements for a minimum number of vectors the driver can pass a +min_vecs argument set to this limit, and the PCI core will return -ENOSPC +if it can't meet the minimum number of vectors. + +The flags argument is used to specify which type of interrupt can be used +by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX). +A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for +any possible kind of interrupt. If the PCI_IRQ_AFFINITY flag is set, +pci_alloc_irq_vectors() will spread the interrupts around the available CPUs. + +To get the Linux IRQ numbers passed to request_irq() and free_irq() and the +vectors, use the following function:: + + int pci_irq_vector(struct pci_dev *dev, unsigned int nr); + +Any allocated resources should be freed before removing the device using +the following function:: + + void pci_free_irq_vectors(struct pci_dev *dev); + +If a device supports both MSI-X and MSI capabilities, this API will use the +MSI-X facilities in preference to the MSI facilities. MSI-X supports any +number of interrupts between 1 and 2048. In contrast, MSI is restricted to +a maximum of 32 interrupts (and must be a power of two). In addition, the +MSI interrupt vectors must be allocated consecutively, so the system might +not be able to allocate as many vectors for MSI as it could for MSI-X. On +some platforms, MSI interrupts must all be targeted at the same set of CPUs +whereas MSI-X interrupts can all be targeted at different CPUs. + +If a device supports neither MSI-X or MSI it will fall back to a single +legacy IRQ vector. + +The typical usage of MSI or MSI-X interrupts is to allocate as many vectors +as possible, likely up to the limit supported by the device. If nvec is +larger than the number supported by the device it will automatically be +capped to the supported limit, so there is no need to query the number of +vectors supported beforehand:: + + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES) + if (nvec < 0) + goto out_err; + +If a driver is unable or unwilling to deal with a variable number of MSI +interrupts it can request a particular number of interrupts by passing that +number to pci_alloc_irq_vectors() function as both 'min_vecs' and +'max_vecs' parameters:: + + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES); + if (ret < 0) + goto out_err; + +The most notorious example of the request type described above is enabling +the single MSI mode for a device. It could be done by passing two 1s as +'min_vecs' and 'max_vecs':: + + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (ret < 0) + goto out_err; + +Some devices might not support using legacy line interrupts, in which case +the driver can specify that only MSI or MSI-X is acceptable:: + + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX); + if (nvec < 0) + goto out_err; + +Legacy APIs +----------- + +The following old APIs to enable and disable MSI or MSI-X interrupts should +not be used in new code:: + + pci_enable_msi() /* deprecated */ + pci_disable_msi() /* deprecated */ + pci_enable_msix_range() /* deprecated */ + pci_enable_msix_exact() /* deprecated */ + pci_disable_msix() /* deprecated */ + +Additionally there are APIs to provide the number of supported MSI or MSI-X +vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these +should be avoided in favor of letting pci_alloc_irq_vectors() cap the +number of vectors. If you have a legitimate special use case for the count +of vectors we might have to revisit that decision and add a +pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. + +Considerations when using MSIs +------------------------------ + +Spinlocks +~~~~~~~~~ + +Most device drivers have a per-device spinlock which is taken in the +interrupt handler. With pin-based interrupts or a single MSI, it is not +necessary to disable interrupts (Linux guarantees the same interrupt will +not be re-entered). If a device uses multiple interrupts, the driver +must disable interrupts while the lock is held. If the device sends +a different interrupt, the driver will deadlock trying to recursively +acquire the spinlock. Such deadlocks can be avoided by using +spin_lock_irqsave() or spin_lock_irq() which disable local interrupts +and acquire the lock (see Documentation/kernel-hacking/locking.rst). + +How to tell whether MSI/MSI-X is enabled on a device +---------------------------------------------------- + +Using 'lspci -v' (as root) may show some devices with "MSI", "Message +Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities +has an 'Enable' flag which is followed with either "+" (enabled) +or "-" (disabled). + + +MSI quirks +========== + +Several PCI chipsets or devices are known not to support MSIs. +The PCI stack provides three ways to disable MSIs: + +1. globally +2. on all devices behind a specific bridge +3. on a single device + +Disabling MSIs globally +----------------------- + +Some host chipsets simply don't support MSIs properly. If we're +lucky, the manufacturer knows this and has indicated it in the ACPI +FADT table. In this case, Linux automatically disables MSIs. +Some boards don't include this information in the table and so we have +to detect them ourselves. The complete list of these is found near the +quirk_disable_all_msi() function in drivers/pci/quirks.c. + +If you have a board which has problems with MSIs, you can pass pci=nomsi +on the kernel command line to disable MSIs on all devices. It would be +in your best interests to report the problem to linux-pci@vger.kernel.org +including a full 'lspci -v' so we can add the quirks to the kernel. + +Disabling MSIs below a bridge +----------------------------- + +Some PCI bridges are not able to route MSIs between busses properly. +In this case, MSIs must be disabled on all devices behind the bridge. + +Some bridges allow you to enable MSIs by changing some bits in their +PCI configuration space (especially the Hypertransport chipsets such +as the nVidia nForce and Serverworks HT2000). As with host chipsets, +Linux mostly knows about them and automatically enables MSIs if it can. +If you have a bridge unknown to Linux, you can enable +MSIs in configuration space using whatever method you know works, then +enable MSIs on that bridge by doing:: + + echo 1 > /sys/bus/pci/devices/$bridge/msi_bus + +where $bridge is the PCI address of the bridge you've enabled (eg +0000:00:0e.0). + +To disable MSIs, echo 0 instead of 1. Changing this value should be +done with caution as it could break interrupt handling for all devices +below this bridge. + +Again, please notify linux-pci@vger.kernel.org of any bridges that need +special handling. + +Disabling MSIs on a single device +--------------------------------- + +Some devices are known to have faulty MSI implementations. Usually this +is handled in the individual device driver, but occasionally it's necessary +to handle this with a quirk. Some drivers have an option to disable use +of MSI. While this is a convenient workaround for the driver author, +it is not good practice, and should not be emulated. + +Finding why MSIs are disabled on a device +----------------------------------------- + +From the above three sections, you can see that there are many reasons +why MSIs may not be enabled for a given device. Your first step should +be to examine your dmesg carefully to determine whether MSIs are enabled +for your machine. You should also check your .config to be sure you +have enabled CONFIG_PCI_MSI. + +Then, 'lspci -t' gives the list of bridges above a device. Reading +`/sys/bus/pci/devices/*/msi_bus` will tell you whether MSIs are enabled (1) +or disabled (0). If 0 is found in any of the msi_bus files belonging +to bridges between the PCI root and the device, MSIs are disabled. + +It is also worth checking the device driver to see whether it supports MSIs. +For example, it may contain calls to pci_irq_alloc_vectors() with the +PCI_IRQ_MSI or PCI_IRQ_MSIX flags. -- cgit From b66357f32fb9a68bb9f2126a894d3b9bbc4e821c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:28 +0800 Subject: Documentation: PCI: convert acpi-info.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Cc: Mauro Carvalho Chehab --- Documentation/PCI/acpi-info.rst | 192 ++++++++++++++++++++++++++++++++++++++++ Documentation/PCI/acpi-info.txt | 187 -------------------------------------- Documentation/PCI/index.rst | 1 + 3 files changed, 193 insertions(+), 187 deletions(-) create mode 100644 Documentation/PCI/acpi-info.rst delete mode 100644 Documentation/PCI/acpi-info.txt (limited to 'Documentation') diff --git a/Documentation/PCI/acpi-info.rst b/Documentation/PCI/acpi-info.rst new file mode 100644 index 000000000000..060217081c79 --- /dev/null +++ b/Documentation/PCI/acpi-info.rst @@ -0,0 +1,192 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +ACPI considerations for PCI host bridges +======================================== + +The general rule is that the ACPI namespace should describe everything the +OS might use unless there's another way for the OS to find it [1, 2]. + +For example, there's no standard hardware mechanism for enumerating PCI +host bridges, so the ACPI namespace must describe each host bridge, the +method for accessing PCI config space below it, the address space windows +the host bridge forwards to PCI (using _CRS), and the routing of legacy +INTx interrupts (using _PRT). + +PCI devices, which are below the host bridge, generally do not need to be +described via ACPI. The OS can discover them via the standard PCI +enumeration mechanism, using config accesses to discover and identify +devices and read and size their BARs. However, ACPI may describe PCI +devices if it provides power management or hotplug functionality for them +or if the device has INTx interrupts connected by platform interrupt +controllers and a _PRT is needed to describe those connections. + +ACPI resource description is done via _CRS objects of devices in the ACPI +namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read +_CRS and figure out what resource is being consumed even if it doesn't have +a driver for the device [3].  That's important because it means an old OS +can work correctly even on a system with new devices unknown to the OS. +The new devices might not do anything, but the OS can at least make sure no +resources conflict with them. + +Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for +reserving address space. The static tables are for things the OS needs to +know early in boot, before it can parse the ACPI namespace. If a new table +is defined, an old OS needs to operate correctly even though it ignores the +table. _CRS allows that because it is generic and understood by the old +OS; a static table does not. + +If the OS is expected to manage a non-discoverable device described via +ACPI, that device will have a specific _HID/_CID that tells the OS what +driver to bind to it, and the _CRS tells the OS and the driver where the +device's registers are. + +PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should +describe all the address space they consume.  This includes all the windows +they forward down to the PCI bus, as well as registers of the host bridge +itself that are not forwarded to PCI.  The host bridge registers include +things like secondary/subordinate bus registers that determine the bus +range below the bridge, window registers that describe the apertures, etc. +These are all device-specific, non-architected things, so the only way a +PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain +the device-specific details.  The host bridge registers also include ECAM +space, since it is consumed by the host bridge. + +ACPI defines a Consumer/Producer bit to distinguish the bridge registers +("Consumer") from the bridge apertures ("Producer") [4, 5], but early +BIOSes didn't use that bit correctly. The result is that the current ACPI +spec defines Consumer/Producer only for the Extended Address Space +descriptors; the bit should be ignored in the older QWord/DWord/Word +Address Space descriptors. Consequently, OSes have to assume all +QWord/DWord/Word descriptors are windows. + +Prior to the addition of Extended Address Space descriptors, the failure of +Consumer/Producer meant there was no way to describe bridge registers in +the PNP0A03/PNP0A08 device itself. The workaround was to describe the +bridge registers (including ECAM space) in PNP0C02 catch-all devices [6]. +With the exception of ECAM, the bridge register space is device-specific +anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to +know about it.   + +New architectures should be able to use "Consumer" Extended Address Space +descriptors in the PNP0A03 device for bridge registers, including ECAM, +although a strict interpretation of [6] might prohibit this. Old x86 and +ia64 kernels assume all address space descriptors, including "Consumer" +Extended Address Space ones, are windows, so it would not be safe to +describe bridge registers this way on those architectures. + +PNP0C02 "motherboard" devices are basically a catch-all.  There's no +programming model for them other than "don't use these resources for +anything else."  So a PNP0C02 _CRS should claim any address space that is +(1) not claimed by _CRS under any other device object in the ACPI namespace +and (2) should not be assigned by the OS to something else. + +The PCIe spec requires the Enhanced Configuration Access Method (ECAM) +unless there's a standard firmware interface for config access, e.g., the +ia64 SAL interface [7]. A host bridge consumes ECAM memory address space +and converts memory accesses into PCI configuration accesses. The spec +defines the ECAM address space layout and functionality; only the base of +the address space is device-specific. An ACPI OS learns the base address +from either the static MCFG table or a _CBA method in the PNP0A03 device. + +The MCFG table must describe the ECAM space of non-hot pluggable host +bridges [8]. Since MCFG is a static table and can't be updated by hotplug, +a _CBA method in the PNP0A03 device describes the ECAM space of a +hot-pluggable host bridge [9]. Note that for both MCFG and _CBA, the base +address always corresponds to bus 0, even if the bus range below the bridge +(which is reported via _CRS) doesn't start at 0. + + +[1] ACPI 6.2, sec 6.1: + For any device that is on a non-enumerable type of bus (for example, an + ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI + system firmware must supply an _HID object ... for each device to + enable OSPM to do that. + +[2] ACPI 6.2, sec 3.7: + The OS enumerates motherboard devices simply by reading through the + ACPI Namespace looking for devices with hardware IDs. + + Each device enumerated by ACPI includes ACPI-defined objects in the + ACPI Namespace that report the hardware resources the device could + occupy [_PRS], an object that reports the resources that are currently + used by the device [_CRS], and objects for configuring those resources + [_SRS]. The information is used by the Plug and Play OS (OSPM) to + configure the devices. + +[3] ACPI 6.2, sec 6.2: + OSPM uses device configuration objects to configure hardware resources + for devices enumerated via ACPI. Device configuration objects provide + information about current and possible resource requirements, the + relationship between shared resources, and methods for configuring + hardware resources. + + When OSPM enumerates a device, it calls _PRS to determine the resource + requirements of the device. It may also call _CRS to find the current + resource settings for the device. Using this information, the Plug and + Play system determines what resources the device should consume and + sets those resources by calling the device’s _SRS control method. + + In ACPI, devices can consume resources (for example, legacy keyboards), + provide resources (for example, a proprietary PCI bridge), or do both. + Unless otherwise specified, resources for a device are assumed to be + taken from the nearest matching resource above the device in the device + hierarchy. + +[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4: + QWord/DWord/Word Address Space Descriptor (.1, .2, .3) + General Flags: Bit [0] Ignored + + Extended Address Space Descriptor (.4) + General Flags: Bit [0] Consumer/Producer: + + * 1 – This device consumes this resource + * 0 – This device produces and consumes this resource + +[5] ACPI 6.2, sec 19.6.43: + ResourceUsage specifies whether the Memory range is consumed by + this device (ResourceConsumer) or passed on to child devices + (ResourceProducer). If nothing is specified, then + ResourceConsumer is assumed. + +[6] PCI Firmware 3.2, sec 4.1.2: + If the operating system does not natively comprehend reserving the + MMCFG region, the MMCFG region must be reserved by firmware. The + address range reported in the MCFG table or by _CBA method (see Section + 4.1.3) must be reserved by declaring a motherboard resource. For most + systems, the motherboard resource would appear at the root of the ACPI + namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and + the resources in this case should not be claimed in the root PCI bus’s + _CRS. The resources can optionally be returned in Int15 E820 or + EFIGetMemoryMap as reserved memory but must always be reported through + ACPI as a motherboard resource. + +[7] PCI Express 4.0, sec 7.2.2: + For systems that are PC-compatible, or that do not implement a + processor-architecture-specific firmware interface standard that allows + access to the Configuration Space, the ECAM is required as defined in + this section. + +[8] PCI Firmware 3.2, sec 4.1.2: + The MCFG table is an ACPI table that is used to communicate the base + addresses corresponding to the non-hot removable PCI Segment Groups + range within a PCI Segment Group available to the operating system at + boot. This is required for the PC-compatible systems. + + The MCFG table is only used to communicate the base addresses + corresponding to the PCI Segment Groups available to the system at + boot. + +[9] PCI Firmware 3.2, sec 4.1.3: + The _CBA (Memory mapped Configuration Base Address) control method is + an optional ACPI object that returns the 64-bit memory mapped + configuration base address for the hot plug capable host bridge. The + base address returned by _CBA is processor-relative address. The _CBA + control method evaluates to an Integer. + + This control method appears under a host bridge object. When the _CBA + method appears under an active host bridge object, the operating system + evaluates this structure to identify the memory mapped configuration + base address corresponding to the PCI Segment Group for the bus number + range specified in _CRS method. An ACPI name space object that contains + the _CBA method must also contain a corresponding _SEG method. diff --git a/Documentation/PCI/acpi-info.txt b/Documentation/PCI/acpi-info.txt deleted file mode 100644 index 3ffa3b03970e..000000000000 --- a/Documentation/PCI/acpi-info.txt +++ /dev/null @@ -1,187 +0,0 @@ - ACPI considerations for PCI host bridges - -The general rule is that the ACPI namespace should describe everything the -OS might use unless there's another way for the OS to find it [1, 2]. - -For example, there's no standard hardware mechanism for enumerating PCI -host bridges, so the ACPI namespace must describe each host bridge, the -method for accessing PCI config space below it, the address space windows -the host bridge forwards to PCI (using _CRS), and the routing of legacy -INTx interrupts (using _PRT). - -PCI devices, which are below the host bridge, generally do not need to be -described via ACPI. The OS can discover them via the standard PCI -enumeration mechanism, using config accesses to discover and identify -devices and read and size their BARs. However, ACPI may describe PCI -devices if it provides power management or hotplug functionality for them -or if the device has INTx interrupts connected by platform interrupt -controllers and a _PRT is needed to describe those connections. - -ACPI resource description is done via _CRS objects of devices in the ACPI -namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read -_CRS and figure out what resource is being consumed even if it doesn't have -a driver for the device [3].  That's important because it means an old OS -can work correctly even on a system with new devices unknown to the OS. -The new devices might not do anything, but the OS can at least make sure no -resources conflict with them. - -Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for -reserving address space. The static tables are for things the OS needs to -know early in boot, before it can parse the ACPI namespace. If a new table -is defined, an old OS needs to operate correctly even though it ignores the -table. _CRS allows that because it is generic and understood by the old -OS; a static table does not. - -If the OS is expected to manage a non-discoverable device described via -ACPI, that device will have a specific _HID/_CID that tells the OS what -driver to bind to it, and the _CRS tells the OS and the driver where the -device's registers are. - -PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should -describe all the address space they consume.  This includes all the windows -they forward down to the PCI bus, as well as registers of the host bridge -itself that are not forwarded to PCI.  The host bridge registers include -things like secondary/subordinate bus registers that determine the bus -range below the bridge, window registers that describe the apertures, etc. -These are all device-specific, non-architected things, so the only way a -PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain -the device-specific details.  The host bridge registers also include ECAM -space, since it is consumed by the host bridge. - -ACPI defines a Consumer/Producer bit to distinguish the bridge registers -("Consumer") from the bridge apertures ("Producer") [4, 5], but early -BIOSes didn't use that bit correctly. The result is that the current ACPI -spec defines Consumer/Producer only for the Extended Address Space -descriptors; the bit should be ignored in the older QWord/DWord/Word -Address Space descriptors. Consequently, OSes have to assume all -QWord/DWord/Word descriptors are windows. - -Prior to the addition of Extended Address Space descriptors, the failure of -Consumer/Producer meant there was no way to describe bridge registers in -the PNP0A03/PNP0A08 device itself. The workaround was to describe the -bridge registers (including ECAM space) in PNP0C02 catch-all devices [6]. -With the exception of ECAM, the bridge register space is device-specific -anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to -know about it.   - -New architectures should be able to use "Consumer" Extended Address Space -descriptors in the PNP0A03 device for bridge registers, including ECAM, -although a strict interpretation of [6] might prohibit this. Old x86 and -ia64 kernels assume all address space descriptors, including "Consumer" -Extended Address Space ones, are windows, so it would not be safe to -describe bridge registers this way on those architectures. - -PNP0C02 "motherboard" devices are basically a catch-all.  There's no -programming model for them other than "don't use these resources for -anything else."  So a PNP0C02 _CRS should claim any address space that is -(1) not claimed by _CRS under any other device object in the ACPI namespace -and (2) should not be assigned by the OS to something else. - -The PCIe spec requires the Enhanced Configuration Access Method (ECAM) -unless there's a standard firmware interface for config access, e.g., the -ia64 SAL interface [7]. A host bridge consumes ECAM memory address space -and converts memory accesses into PCI configuration accesses. The spec -defines the ECAM address space layout and functionality; only the base of -the address space is device-specific. An ACPI OS learns the base address -from either the static MCFG table or a _CBA method in the PNP0A03 device. - -The MCFG table must describe the ECAM space of non-hot pluggable host -bridges [8]. Since MCFG is a static table and can't be updated by hotplug, -a _CBA method in the PNP0A03 device describes the ECAM space of a -hot-pluggable host bridge [9]. Note that for both MCFG and _CBA, the base -address always corresponds to bus 0, even if the bus range below the bridge -(which is reported via _CRS) doesn't start at 0. - - -[1] ACPI 6.2, sec 6.1: - For any device that is on a non-enumerable type of bus (for example, an - ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI - system firmware must supply an _HID object ... for each device to - enable OSPM to do that. - -[2] ACPI 6.2, sec 3.7: - The OS enumerates motherboard devices simply by reading through the - ACPI Namespace looking for devices with hardware IDs. - - Each device enumerated by ACPI includes ACPI-defined objects in the - ACPI Namespace that report the hardware resources the device could - occupy [_PRS], an object that reports the resources that are currently - used by the device [_CRS], and objects for configuring those resources - [_SRS]. The information is used by the Plug and Play OS (OSPM) to - configure the devices. - -[3] ACPI 6.2, sec 6.2: - OSPM uses device configuration objects to configure hardware resources - for devices enumerated via ACPI. Device configuration objects provide - information about current and possible resource requirements, the - relationship between shared resources, and methods for configuring - hardware resources. - - When OSPM enumerates a device, it calls _PRS to determine the resource - requirements of the device. It may also call _CRS to find the current - resource settings for the device. Using this information, the Plug and - Play system determines what resources the device should consume and - sets those resources by calling the device’s _SRS control method. - - In ACPI, devices can consume resources (for example, legacy keyboards), - provide resources (for example, a proprietary PCI bridge), or do both. - Unless otherwise specified, resources for a device are assumed to be - taken from the nearest matching resource above the device in the device - hierarchy. - -[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4: - QWord/DWord/Word Address Space Descriptor (.1, .2, .3) - General Flags: Bit [0] Ignored - - Extended Address Space Descriptor (.4) - General Flags: Bit [0] Consumer/Producer: - 1–This device consumes this resource - 0–This device produces and consumes this resource - -[5] ACPI 6.2, sec 19.6.43: - ResourceUsage specifies whether the Memory range is consumed by - this device (ResourceConsumer) or passed on to child devices - (ResourceProducer). If nothing is specified, then - ResourceConsumer is assumed. - -[6] PCI Firmware 3.2, sec 4.1.2: - If the operating system does not natively comprehend reserving the - MMCFG region, the MMCFG region must be reserved by firmware. The - address range reported in the MCFG table or by _CBA method (see Section - 4.1.3) must be reserved by declaring a motherboard resource. For most - systems, the motherboard resource would appear at the root of the ACPI - namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and - the resources in this case should not be claimed in the root PCI bus’s - _CRS. The resources can optionally be returned in Int15 E820 or - EFIGetMemoryMap as reserved memory but must always be reported through - ACPI as a motherboard resource. - -[7] PCI Express 4.0, sec 7.2.2: - For systems that are PC-compatible, or that do not implement a - processor-architecture-specific firmware interface standard that allows - access to the Configuration Space, the ECAM is required as defined in - this section. - -[8] PCI Firmware 3.2, sec 4.1.2: - The MCFG table is an ACPI table that is used to communicate the base - addresses corresponding to the non-hot removable PCI Segment Groups - range within a PCI Segment Group available to the operating system at - boot. This is required for the PC-compatible systems. - - The MCFG table is only used to communicate the base addresses - corresponding to the PCI Segment Groups available to the system at - boot. - -[9] PCI Firmware 3.2, sec 4.1.3: - The _CBA (Memory mapped Configuration Base Address) control method is - an optional ACPI object that returns the 64-bit memory mapped - configuration base address for the hot plug capable host bridge. The - base address returned by _CBA is processor-relative address. The _CBA - control method evaluates to an Integer. - - This control method appears under a host bridge object. When the _CBA - method appears under an active host bridge object, the operating system - evaluates this structure to identify the memory mapped configuration - base address corresponding to the PCI Segment Group for the bus number - range specified in _CRS method. An ACPI name space object that contains - the _CBA method must also contain a corresponding _SEG method. diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 458354daac47..6f573f3df993 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -12,3 +12,4 @@ Linux PCI Bus Subsystem picebus-howto pci-iov-howto msi-howto + acpi-info -- cgit From 8a01fa64348aaaf54b3eef9728bfe2654e7bdd88 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:29 +0800 Subject: Documentation: PCI: convert pci-error-recovery.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 1 + Documentation/PCI/pci-error-recovery.rst | 424 +++++++++++++++++++++++++++++++ Documentation/PCI/pci-error-recovery.txt | 413 ------------------------------ 3 files changed, 425 insertions(+), 413 deletions(-) create mode 100644 Documentation/PCI/pci-error-recovery.rst delete mode 100644 Documentation/PCI/pci-error-recovery.txt (limited to 'Documentation') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 6f573f3df993..92e62d0fc9e6 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -13,3 +13,4 @@ Linux PCI Bus Subsystem pci-iov-howto msi-howto acpi-info + pci-error-recovery diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst new file mode 100644 index 000000000000..83db42092935 --- /dev/null +++ b/Documentation/PCI/pci-error-recovery.rst @@ -0,0 +1,424 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +PCI Error Recovery +================== + + +:Authors: - Linas Vepstas + - Richard Lary + - Mike Mason + + +Many PCI bus controllers are able to detect a variety of hardware +PCI errors on the bus, such as parity errors on the data and address +buses, as well as SERR and PERR errors. Some of the more advanced +chipsets are able to deal with these errors; these include PCI-E chipsets, +and the PCI-host bridges found on IBM Power4, Power5 and Power6-based +pSeries boxes. A typical action taken is to disconnect the affected device, +halting all I/O to it. The goal of a disconnection is to avoid system +corruption; for example, to halt system memory corruption due to DMA's +to "wild" addresses. Typically, a reconnection mechanism is also +offered, so that the affected PCI device(s) are reset and put back +into working condition. The reset phase requires coordination +between the affected device drivers and the PCI controller chip. +This document describes a generic API for notifying device drivers +of a bus disconnection, and then performing error recovery. +This API is currently implemented in the 2.6.16 and later kernels. + +Reporting and recovery is performed in several steps. First, when +a PCI hardware error has resulted in a bus disconnect, that event +is reported as soon as possible to all affected device drivers, +including multiple instances of a device driver on multi-function +cards. This allows device drivers to avoid deadlocking in spinloops, +waiting for some i/o-space register to change, when it never will. +It also gives the drivers a chance to defer incoming I/O as +needed. + +Next, recovery is performed in several stages. Most of the complexity +is forced by the need to handle multi-function devices, that is, +devices that have multiple device drivers associated with them. +In the first stage, each driver is allowed to indicate what type +of reset it desires, the choices being a simple re-enabling of I/O +or requesting a slot reset. + +If any driver requests a slot reset, that is what will be done. + +After a reset and/or a re-enabling of I/O, all drivers are +again notified, so that they may then perform any device setup/config +that may be required. After these have all completed, a final +"resume normal operations" event is sent out. + +The biggest reason for choosing a kernel-based implementation rather +than a user-space implementation was the need to deal with bus +disconnects of PCI devices attached to storage media, and, in particular, +disconnects from devices holding the root file system. If the root +file system is disconnected, a user-space mechanism would have to go +through a large number of contortions to complete recovery. Almost all +of the current Linux file systems are not tolerant of disconnection +from/reconnection to their underlying block device. By contrast, +bus errors are easy to manage in the device driver. Indeed, most +device drivers already handle very similar recovery procedures; +for example, the SCSI-generic layer already provides significant +mechanisms for dealing with SCSI bus errors and SCSI bus resets. + + +Detailed Design +=============== + +Design and implementation details below, based on a chain of +public email discussions with Ben Herrenschmidt, circa 5 April 2005. + +The error recovery API support is exposed to the driver in the form of +a structure of function pointers pointed to by a new field in struct +pci_driver. A driver that fails to provide the structure is "non-aware", +and the actual recovery steps taken are platform dependent. The +arch/powerpc implementation will simulate a PCI hotplug remove/add. + +This structure has the form:: + + struct pci_error_handlers + { + int (*error_detected)(struct pci_dev *dev, enum pci_channel_state); + int (*mmio_enabled)(struct pci_dev *dev); + int (*slot_reset)(struct pci_dev *dev); + void (*resume)(struct pci_dev *dev); + }; + +The possible channel states are:: + + enum pci_channel_state { + pci_channel_io_normal, /* I/O channel is in normal state */ + pci_channel_io_frozen, /* I/O to channel is blocked */ + pci_channel_io_perm_failure, /* PCI card is dead */ + }; + +Possible return values are:: + + enum pci_ers_result { + PCI_ERS_RESULT_NONE, /* no result/none/not supported in device driver */ + PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */ + PCI_ERS_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */ + PCI_ERS_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */ + PCI_ERS_RESULT_RECOVERED, /* Device driver is fully recovered and operational */ + }; + +A driver does not have to implement all of these callbacks; however, +if it implements any, it must implement error_detected(). If a callback +is not implemented, the corresponding feature is considered unsupported. +For example, if mmio_enabled() and resume() aren't there, then it +is assumed that the driver is not doing any direct recovery and requires +a slot reset. Typically a driver will want to know about +a slot_reset(). + +The actual steps taken by a platform to recover from a PCI error +event will be platform-dependent, but will follow the general +sequence described below. + +STEP 0: Error Event +------------------- +A PCI bus error is detected by the PCI hardware. On powerpc, the slot +is isolated, in that all I/O is blocked: all reads return 0xffffffff, +all writes are ignored. + + +STEP 1: Notification +-------------------- +Platform calls the error_detected() callback on every instance of +every driver affected by the error. + +At this point, the device might not be accessible anymore, depending on +the platform (the slot will be isolated on powerpc). The driver may +already have "noticed" the error because of a failing I/O, but this +is the proper "synchronization point", that is, it gives the driver +a chance to cleanup, waiting for pending stuff (timers, whatever, etc...) +to complete; it can take semaphores, schedule, etc... everything but +touch the device. Within this function and after it returns, the driver +shouldn't do any new IOs. Called in task context. This is sort of a +"quiesce" point. See note about interrupts at the end of this doc. + +All drivers participating in this system must implement this call. +The driver must return one of the following result codes: + + - PCI_ERS_RESULT_CAN_RECOVER + Driver returns this if it thinks it might be able to recover + the HW by just banging IOs or if it wants to be given + a chance to extract some diagnostic information (see + mmio_enable, below). + - PCI_ERS_RESULT_NEED_RESET + Driver returns this if it can't recover without a + slot reset. + - PCI_ERS_RESULT_DISCONNECT + Driver returns this if it doesn't want to recover at all. + +The next step taken will depend on the result codes returned by the +drivers. + +If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER, +then the platform should re-enable IOs on the slot (or do nothing in +particular, if the platform doesn't isolate slots), and recovery +proceeds to STEP 2 (MMIO Enable). + +If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET), +then recovery proceeds to STEP 4 (Slot Reset). + +If the platform is unable to recover the slot, the next step +is STEP 6 (Permanent Failure). + +.. note:: + + The current powerpc implementation assumes that a device driver will + *not* schedule or semaphore in this routine; the current powerpc + implementation uses one kernel thread to notify all devices; + thus, if one device sleeps/schedules, all devices are affected. + Doing better requires complex multi-threaded logic in the error + recovery implementation (e.g. waiting for all notification threads + to "join" before proceeding with recovery.) This seems excessively + complex and not worth implementing. + + The current powerpc implementation doesn't much care if the device + attempts I/O at this point, or not. I/O's will fail, returning + a value of 0xff on read, and writes will be dropped. If more than + EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH + assumes that the device driver has gone into an infinite loop + and prints an error to syslog. A reboot is then required to + get the device working again. + +STEP 2: MMIO Enabled +-------------------- +The platform re-enables MMIO to the device (but typically not the +DMA), and then calls the mmio_enabled() callback on all affected +device drivers. + +This is the "early recovery" call. IOs are allowed again, but DMA is +not, with some restrictions. This is NOT a callback for the driver to +start operations again, only to peek/poke at the device, extract diagnostic +information, if any, and eventually do things like trigger a device local +reset or some such, but not restart operations. This callback is made if +all drivers on a segment agree that they can try to recover and if no automatic +link reset was performed by the HW. If the platform can't just re-enable IOs +without a slot reset or a link reset, it will not call this callback, and +instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset) + +.. note:: + + The following is proposed; no platform implements this yet: + Proposal: All I/O's should be done _synchronously_ from within + this callback, errors triggered by them will be returned via + the normal pci_check_whatever() API, no new error_detected() + callback will be issued due to an error happening here. However, + such an error might cause IOs to be re-blocked for the whole + segment, and thus invalidate the recovery that other devices + on the same segment might have done, forcing the whole segment + into one of the next states, that is, link reset or slot reset. + +The driver should return one of the following result codes: + - PCI_ERS_RESULT_RECOVERED + Driver returns this if it thinks the device is fully + functional and thinks it is ready to start + normal driver operations again. There is no + guarantee that the driver will actually be + allowed to proceed, as another driver on the + same segment might have failed and thus triggered a + slot reset on platforms that support it. + + - PCI_ERS_RESULT_NEED_RESET + Driver returns this if it thinks the device is not + recoverable in its current state and it needs a slot + reset to proceed. + + - PCI_ERS_RESULT_DISCONNECT + Same as above. Total failure, no recovery even after + reset driver dead. (To be defined more precisely) + +The next step taken depends on the results returned by the drivers. +If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform +proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations). + +If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform +proceeds to STEP 4 (Slot Reset) + +STEP 3: Link Reset +------------------ +The platform resets the link. This is a PCI-Express specific step +and is done whenever a fatal error has been detected that can be +"solved" by resetting the link. + +STEP 4: Slot Reset +------------------ + +In response to a return value of PCI_ERS_RESULT_NEED_RESET, the +the platform will perform a slot reset on the requesting PCI device(s). +The actual steps taken by a platform to perform a slot reset +will be platform-dependent. Upon completion of slot reset, the +platform will call the device slot_reset() callback. + +Powerpc platforms implement two levels of slot reset: +soft reset(default) and fundamental(optional) reset. + +Powerpc soft reset consists of asserting the adapter #RST line and then +restoring the PCI BAR's and PCI configuration header to a state +that is equivalent to what it would be after a fresh system +power-on followed by power-on BIOS/system firmware initialization. +Soft reset is also known as hot-reset. + +Powerpc fundamental reset is supported by PCI Express cards only +and results in device's state machines, hardware logic, port states and +configuration registers to initialize to their default conditions. + +For most PCI devices, a soft reset will be sufficient for recovery. +Optional fundamental reset is provided to support a limited number +of PCI Express devices for which a soft reset is not sufficient +for recovery. + +If the platform supports PCI hotplug, then the reset might be +performed by toggling the slot electrical power off/on. + +It is important for the platform to restore the PCI config space +to the "fresh poweron" state, rather than the "last state". After +a slot reset, the device driver will almost always use its standard +device initialization routines, and an unusual config space setup +may result in hung devices, kernel panics, or silent data corruption. + +This call gives drivers the chance to re-initialize the hardware +(re-download firmware, etc.). At this point, the driver may assume +that the card is in a fresh state and is fully functional. The slot +is unfrozen and the driver has full access to PCI config space, +memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X) +will also be available. + +Drivers should not restart normal I/O processing operations +at this point. If all device drivers report success on this +callback, the platform will call resume() to complete the sequence, +and let the driver restart normal I/O processing. + +A driver can still return a critical failure for this function if +it can't get the device operational after reset. If the platform +previously tried a soft reset, it might now try a hard reset (power +cycle) and then call slot_reset() again. It the device still can't +be recovered, there is nothing more that can be done; the platform +will typically report a "permanent failure" in such a case. The +device will be considered "dead" in this case. + +Drivers for multi-function cards will need to coordinate among +themselves as to which driver instance will perform any "one-shot" +or global device initialization. For example, the Symbios sym53cxx2 +driver performs device init only from PCI function 0:: + + + if (PCI_FUNC(pdev->devfn) == 0) + + sym_reset_scsi_bus(np, 0); + +Result codes: + - PCI_ERS_RESULT_DISCONNECT + Same as above. + +Drivers for PCI Express cards that require a fundamental reset must +set the needs_freset bit in the pci_dev structure in their probe function. +For example, the QLogic qla2xxx driver sets the needs_freset bit for certain +PCI card types:: + + + /* Set EEH reset type to fundamental if required by hba */ + + if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha)) + + pdev->needs_freset = 1; + + + +Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent +Failure). + +.. note:: + + The current powerpc implementation does not try a power-cycle + reset if the driver returned PCI_ERS_RESULT_DISCONNECT. + However, it probably should. + + +STEP 5: Resume Operations +------------------------- +The platform will call the resume() callback on all affected device +drivers if all drivers on the segment have returned +PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks. +The goal of this callback is to tell the driver to restart activity, +that everything is back and running. This callback does not return +a result code. + +At this point, if a new error happens, the platform will restart +a new error recovery sequence. + +STEP 6: Permanent Failure +------------------------- +A "permanent failure" has occurred, and the platform cannot recover +the device. The platform will call error_detected() with a +pci_channel_state value of pci_channel_io_perm_failure. + +The device driver should, at this point, assume the worst. It should +cancel all pending I/O, refuse all new I/O, returning -EIO to +higher layers. The device driver should then clean up all of its +memory and remove itself from kernel operations, much as it would +during system shutdown. + +The platform will typically notify the system operator of the +permanent failure in some way. If the device is hotplug-capable, +the operator will probably want to remove and replace the device. +Note, however, not all failures are truly "permanent". Some are +caused by over-heating, some by a poorly seated card. Many +PCI error events are caused by software bugs, e.g. DMA's to +wild addresses or bogus split transactions due to programming +errors. See the discussion in powerpc/eeh-pci-error-recovery.txt +for additional detail on real-life experience of the causes of +software errors. + + +Conclusion; General Remarks +--------------------------- +The way the callbacks are called is platform policy. A platform with +no slot reset capability may want to just "ignore" drivers that can't +recover (disconnect them) and try to let other cards on the same segment +recover. Keep in mind that in most real life cases, though, there will +be only one driver per segment. + +Now, a note about interrupts. If you get an interrupt and your +device is dead or has been isolated, there is a problem :) +The current policy is to turn this into a platform policy. +That is, the recovery API only requires that: + + - There is no guarantee that interrupt delivery can proceed from any + device on the segment starting from the error detection and until the + slot_reset callback is called, at which point interrupts are expected + to be fully operational. + + - There is no guarantee that interrupt delivery is stopped, that is, + a driver that gets an interrupt after detecting an error, or that detects + an error within the interrupt handler such that it prevents proper + ack'ing of the interrupt (and thus removal of the source) should just + return IRQ_NOTHANDLED. It's up to the platform to deal with that + condition, typically by masking the IRQ source during the duration of + the error handling. It is expected that the platform "knows" which + interrupts are routed to error-management capable slots and can deal + with temporarily disabling that IRQ number during error processing (this + isn't terribly complex). That means some IRQ latency for other devices + sharing the interrupt, but there is simply no other way. High end + platforms aren't supposed to share interrupts between many devices + anyway :) + +.. note:: + + Implementation details for the powerpc platform are discussed in + the file Documentation/powerpc/eeh-pci-error-recovery.txt + + As of this writing, there is a growing list of device drivers with + patches implementing error recovery. Not all of these patches are in + mainline yet. These may be used as "examples": + + - drivers/scsi/ipr + - drivers/scsi/sym53c8xx_2 + - drivers/scsi/qla2xxx + - drivers/scsi/lpfc + - drivers/next/bnx2.c + - drivers/next/e100.c + - drivers/net/e1000 + - drivers/net/e1000e + - drivers/net/ixgb + - drivers/net/ixgbe + - drivers/net/cxgb3 + - drivers/net/s2io.c + - drivers/net/qlge diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt deleted file mode 100644 index 0b6bb3ef449e..000000000000 --- a/Documentation/PCI/pci-error-recovery.txt +++ /dev/null @@ -1,413 +0,0 @@ - - PCI Error Recovery - ------------------ - February 2, 2006 - - Current document maintainer: - Linas Vepstas - updated by Richard Lary - and Mike Mason on 27-Jul-2009 - - -Many PCI bus controllers are able to detect a variety of hardware -PCI errors on the bus, such as parity errors on the data and address -buses, as well as SERR and PERR errors. Some of the more advanced -chipsets are able to deal with these errors; these include PCI-E chipsets, -and the PCI-host bridges found on IBM Power4, Power5 and Power6-based -pSeries boxes. A typical action taken is to disconnect the affected device, -halting all I/O to it. The goal of a disconnection is to avoid system -corruption; for example, to halt system memory corruption due to DMA's -to "wild" addresses. Typically, a reconnection mechanism is also -offered, so that the affected PCI device(s) are reset and put back -into working condition. The reset phase requires coordination -between the affected device drivers and the PCI controller chip. -This document describes a generic API for notifying device drivers -of a bus disconnection, and then performing error recovery. -This API is currently implemented in the 2.6.16 and later kernels. - -Reporting and recovery is performed in several steps. First, when -a PCI hardware error has resulted in a bus disconnect, that event -is reported as soon as possible to all affected device drivers, -including multiple instances of a device driver on multi-function -cards. This allows device drivers to avoid deadlocking in spinloops, -waiting for some i/o-space register to change, when it never will. -It also gives the drivers a chance to defer incoming I/O as -needed. - -Next, recovery is performed in several stages. Most of the complexity -is forced by the need to handle multi-function devices, that is, -devices that have multiple device drivers associated with them. -In the first stage, each driver is allowed to indicate what type -of reset it desires, the choices being a simple re-enabling of I/O -or requesting a slot reset. - -If any driver requests a slot reset, that is what will be done. - -After a reset and/or a re-enabling of I/O, all drivers are -again notified, so that they may then perform any device setup/config -that may be required. After these have all completed, a final -"resume normal operations" event is sent out. - -The biggest reason for choosing a kernel-based implementation rather -than a user-space implementation was the need to deal with bus -disconnects of PCI devices attached to storage media, and, in particular, -disconnects from devices holding the root file system. If the root -file system is disconnected, a user-space mechanism would have to go -through a large number of contortions to complete recovery. Almost all -of the current Linux file systems are not tolerant of disconnection -from/reconnection to their underlying block device. By contrast, -bus errors are easy to manage in the device driver. Indeed, most -device drivers already handle very similar recovery procedures; -for example, the SCSI-generic layer already provides significant -mechanisms for dealing with SCSI bus errors and SCSI bus resets. - - -Detailed Design ---------------- -Design and implementation details below, based on a chain of -public email discussions with Ben Herrenschmidt, circa 5 April 2005. - -The error recovery API support is exposed to the driver in the form of -a structure of function pointers pointed to by a new field in struct -pci_driver. A driver that fails to provide the structure is "non-aware", -and the actual recovery steps taken are platform dependent. The -arch/powerpc implementation will simulate a PCI hotplug remove/add. - -This structure has the form: -struct pci_error_handlers -{ - int (*error_detected)(struct pci_dev *dev, enum pci_channel_state); - int (*mmio_enabled)(struct pci_dev *dev); - int (*slot_reset)(struct pci_dev *dev); - void (*resume)(struct pci_dev *dev); -}; - -The possible channel states are: -enum pci_channel_state { - pci_channel_io_normal, /* I/O channel is in normal state */ - pci_channel_io_frozen, /* I/O to channel is blocked */ - pci_channel_io_perm_failure, /* PCI card is dead */ -}; - -Possible return values are: -enum pci_ers_result { - PCI_ERS_RESULT_NONE, /* no result/none/not supported in device driver */ - PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */ - PCI_ERS_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */ - PCI_ERS_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */ - PCI_ERS_RESULT_RECOVERED, /* Device driver is fully recovered and operational */ -}; - -A driver does not have to implement all of these callbacks; however, -if it implements any, it must implement error_detected(). If a callback -is not implemented, the corresponding feature is considered unsupported. -For example, if mmio_enabled() and resume() aren't there, then it -is assumed that the driver is not doing any direct recovery and requires -a slot reset. Typically a driver will want to know about -a slot_reset(). - -The actual steps taken by a platform to recover from a PCI error -event will be platform-dependent, but will follow the general -sequence described below. - -STEP 0: Error Event -------------------- -A PCI bus error is detected by the PCI hardware. On powerpc, the slot -is isolated, in that all I/O is blocked: all reads return 0xffffffff, -all writes are ignored. - - -STEP 1: Notification --------------------- -Platform calls the error_detected() callback on every instance of -every driver affected by the error. - -At this point, the device might not be accessible anymore, depending on -the platform (the slot will be isolated on powerpc). The driver may -already have "noticed" the error because of a failing I/O, but this -is the proper "synchronization point", that is, it gives the driver -a chance to cleanup, waiting for pending stuff (timers, whatever, etc...) -to complete; it can take semaphores, schedule, etc... everything but -touch the device. Within this function and after it returns, the driver -shouldn't do any new IOs. Called in task context. This is sort of a -"quiesce" point. See note about interrupts at the end of this doc. - -All drivers participating in this system must implement this call. -The driver must return one of the following result codes: - - PCI_ERS_RESULT_CAN_RECOVER: - Driver returns this if it thinks it might be able to recover - the HW by just banging IOs or if it wants to be given - a chance to extract some diagnostic information (see - mmio_enable, below). - - PCI_ERS_RESULT_NEED_RESET: - Driver returns this if it can't recover without a - slot reset. - - PCI_ERS_RESULT_DISCONNECT: - Driver returns this if it doesn't want to recover at all. - -The next step taken will depend on the result codes returned by the -drivers. - -If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER, -then the platform should re-enable IOs on the slot (or do nothing in -particular, if the platform doesn't isolate slots), and recovery -proceeds to STEP 2 (MMIO Enable). - -If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET), -then recovery proceeds to STEP 4 (Slot Reset). - -If the platform is unable to recover the slot, the next step -is STEP 6 (Permanent Failure). - ->>> The current powerpc implementation assumes that a device driver will ->>> *not* schedule or semaphore in this routine; the current powerpc ->>> implementation uses one kernel thread to notify all devices; ->>> thus, if one device sleeps/schedules, all devices are affected. ->>> Doing better requires complex multi-threaded logic in the error ->>> recovery implementation (e.g. waiting for all notification threads ->>> to "join" before proceeding with recovery.) This seems excessively ->>> complex and not worth implementing. - ->>> The current powerpc implementation doesn't much care if the device ->>> attempts I/O at this point, or not. I/O's will fail, returning ->>> a value of 0xff on read, and writes will be dropped. If more than ->>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH ->>> assumes that the device driver has gone into an infinite loop ->>> and prints an error to syslog. A reboot is then required to ->>> get the device working again. - -STEP 2: MMIO Enabled -------------------- -The platform re-enables MMIO to the device (but typically not the -DMA), and then calls the mmio_enabled() callback on all affected -device drivers. - -This is the "early recovery" call. IOs are allowed again, but DMA is -not, with some restrictions. This is NOT a callback for the driver to -start operations again, only to peek/poke at the device, extract diagnostic -information, if any, and eventually do things like trigger a device local -reset or some such, but not restart operations. This callback is made if -all drivers on a segment agree that they can try to recover and if no automatic -link reset was performed by the HW. If the platform can't just re-enable IOs -without a slot reset or a link reset, it will not call this callback, and -instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset) - ->>> The following is proposed; no platform implements this yet: ->>> Proposal: All I/O's should be done _synchronously_ from within ->>> this callback, errors triggered by them will be returned via ->>> the normal pci_check_whatever() API, no new error_detected() ->>> callback will be issued due to an error happening here. However, ->>> such an error might cause IOs to be re-blocked for the whole ->>> segment, and thus invalidate the recovery that other devices ->>> on the same segment might have done, forcing the whole segment ->>> into one of the next states, that is, link reset or slot reset. - -The driver should return one of the following result codes: - - PCI_ERS_RESULT_RECOVERED - Driver returns this if it thinks the device is fully - functional and thinks it is ready to start - normal driver operations again. There is no - guarantee that the driver will actually be - allowed to proceed, as another driver on the - same segment might have failed and thus triggered a - slot reset on platforms that support it. - - - PCI_ERS_RESULT_NEED_RESET - Driver returns this if it thinks the device is not - recoverable in its current state and it needs a slot - reset to proceed. - - - PCI_ERS_RESULT_DISCONNECT - Same as above. Total failure, no recovery even after - reset driver dead. (To be defined more precisely) - -The next step taken depends on the results returned by the drivers. -If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform -proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations). - -If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform -proceeds to STEP 4 (Slot Reset) - -STEP 3: Link Reset ------------------- -The platform resets the link. This is a PCI-Express specific step -and is done whenever a fatal error has been detected that can be -"solved" by resetting the link. - -STEP 4: Slot Reset ------------------- - -In response to a return value of PCI_ERS_RESULT_NEED_RESET, the -the platform will perform a slot reset on the requesting PCI device(s). -The actual steps taken by a platform to perform a slot reset -will be platform-dependent. Upon completion of slot reset, the -platform will call the device slot_reset() callback. - -Powerpc platforms implement two levels of slot reset: -soft reset(default) and fundamental(optional) reset. - -Powerpc soft reset consists of asserting the adapter #RST line and then -restoring the PCI BAR's and PCI configuration header to a state -that is equivalent to what it would be after a fresh system -power-on followed by power-on BIOS/system firmware initialization. -Soft reset is also known as hot-reset. - -Powerpc fundamental reset is supported by PCI Express cards only -and results in device's state machines, hardware logic, port states and -configuration registers to initialize to their default conditions. - -For most PCI devices, a soft reset will be sufficient for recovery. -Optional fundamental reset is provided to support a limited number -of PCI Express devices for which a soft reset is not sufficient -for recovery. - -If the platform supports PCI hotplug, then the reset might be -performed by toggling the slot electrical power off/on. - -It is important for the platform to restore the PCI config space -to the "fresh poweron" state, rather than the "last state". After -a slot reset, the device driver will almost always use its standard -device initialization routines, and an unusual config space setup -may result in hung devices, kernel panics, or silent data corruption. - -This call gives drivers the chance to re-initialize the hardware -(re-download firmware, etc.). At this point, the driver may assume -that the card is in a fresh state and is fully functional. The slot -is unfrozen and the driver has full access to PCI config space, -memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X) -will also be available. - -Drivers should not restart normal I/O processing operations -at this point. If all device drivers report success on this -callback, the platform will call resume() to complete the sequence, -and let the driver restart normal I/O processing. - -A driver can still return a critical failure for this function if -it can't get the device operational after reset. If the platform -previously tried a soft reset, it might now try a hard reset (power -cycle) and then call slot_reset() again. It the device still can't -be recovered, there is nothing more that can be done; the platform -will typically report a "permanent failure" in such a case. The -device will be considered "dead" in this case. - -Drivers for multi-function cards will need to coordinate among -themselves as to which driver instance will perform any "one-shot" -or global device initialization. For example, the Symbios sym53cxx2 -driver performs device init only from PCI function 0: - -+ if (PCI_FUNC(pdev->devfn) == 0) -+ sym_reset_scsi_bus(np, 0); - - Result codes: - - PCI_ERS_RESULT_DISCONNECT - Same as above. - -Drivers for PCI Express cards that require a fundamental reset must -set the needs_freset bit in the pci_dev structure in their probe function. -For example, the QLogic qla2xxx driver sets the needs_freset bit for certain -PCI card types: - -+ /* Set EEH reset type to fundamental if required by hba */ -+ if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha)) -+ pdev->needs_freset = 1; -+ - -Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent -Failure). - ->>> The current powerpc implementation does not try a power-cycle ->>> reset if the driver returned PCI_ERS_RESULT_DISCONNECT. ->>> However, it probably should. - - -STEP 5: Resume Operations -------------------------- -The platform will call the resume() callback on all affected device -drivers if all drivers on the segment have returned -PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks. -The goal of this callback is to tell the driver to restart activity, -that everything is back and running. This callback does not return -a result code. - -At this point, if a new error happens, the platform will restart -a new error recovery sequence. - -STEP 6: Permanent Failure -------------------------- -A "permanent failure" has occurred, and the platform cannot recover -the device. The platform will call error_detected() with a -pci_channel_state value of pci_channel_io_perm_failure. - -The device driver should, at this point, assume the worst. It should -cancel all pending I/O, refuse all new I/O, returning -EIO to -higher layers. The device driver should then clean up all of its -memory and remove itself from kernel operations, much as it would -during system shutdown. - -The platform will typically notify the system operator of the -permanent failure in some way. If the device is hotplug-capable, -the operator will probably want to remove and replace the device. -Note, however, not all failures are truly "permanent". Some are -caused by over-heating, some by a poorly seated card. Many -PCI error events are caused by software bugs, e.g. DMA's to -wild addresses or bogus split transactions due to programming -errors. See the discussion in powerpc/eeh-pci-error-recovery.txt -for additional detail on real-life experience of the causes of -software errors. - - -Conclusion; General Remarks ---------------------------- -The way the callbacks are called is platform policy. A platform with -no slot reset capability may want to just "ignore" drivers that can't -recover (disconnect them) and try to let other cards on the same segment -recover. Keep in mind that in most real life cases, though, there will -be only one driver per segment. - -Now, a note about interrupts. If you get an interrupt and your -device is dead or has been isolated, there is a problem :) -The current policy is to turn this into a platform policy. -That is, the recovery API only requires that: - - - There is no guarantee that interrupt delivery can proceed from any -device on the segment starting from the error detection and until the -slot_reset callback is called, at which point interrupts are expected -to be fully operational. - - - There is no guarantee that interrupt delivery is stopped, that is, -a driver that gets an interrupt after detecting an error, or that detects -an error within the interrupt handler such that it prevents proper -ack'ing of the interrupt (and thus removal of the source) should just -return IRQ_NOTHANDLED. It's up to the platform to deal with that -condition, typically by masking the IRQ source during the duration of -the error handling. It is expected that the platform "knows" which -interrupts are routed to error-management capable slots and can deal -with temporarily disabling that IRQ number during error processing (this -isn't terribly complex). That means some IRQ latency for other devices -sharing the interrupt, but there is simply no other way. High end -platforms aren't supposed to share interrupts between many devices -anyway :) - ->>> Implementation details for the powerpc platform are discussed in ->>> the file Documentation/powerpc/eeh-pci-error-recovery.txt - ->>> As of this writing, there is a growing list of device drivers with ->>> patches implementing error recovery. Not all of these patches are in ->>> mainline yet. These may be used as "examples": ->>> ->>> drivers/scsi/ipr ->>> drivers/scsi/sym53c8xx_2 ->>> drivers/scsi/qla2xxx ->>> drivers/scsi/lpfc ->>> drivers/next/bnx2.c ->>> drivers/next/e100.c ->>> drivers/net/e1000 ->>> drivers/net/e1000e ->>> drivers/net/ixgb ->>> drivers/net/ixgbe ->>> drivers/net/cxgb3 ->>> drivers/net/s2io.c ->>> drivers/net/qlge - -The End -------- -- cgit From 4e37f055a92e4a813b29fb196a05a6a826abb790 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:30 +0800 Subject: Documentation: PCI: convert pcieaer-howto.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 1 + Documentation/PCI/pcieaer-howto.rst | 311 ++++++++++++++++++++++++++++++++++++ Documentation/PCI/pcieaer-howto.txt | 267 ------------------------------- 3 files changed, 312 insertions(+), 267 deletions(-) create mode 100644 Documentation/PCI/pcieaer-howto.rst delete mode 100644 Documentation/PCI/pcieaer-howto.txt (limited to 'Documentation') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 92e62d0fc9e6..f54b65b1ca5f 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -14,3 +14,4 @@ Linux PCI Bus Subsystem msi-howto acpi-info pci-error-recovery + pcieaer-howto diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst new file mode 100644 index 000000000000..18bdefaafd1a --- /dev/null +++ b/Documentation/PCI/pcieaer-howto.rst @@ -0,0 +1,311 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================================== +The PCI Express Advanced Error Reporting Driver Guide HOWTO +=========================================================== + +:Authors: - T. Long Nguyen + - Yanmin Zhang + +:Copyright: |copy| 2006 Intel Corporation + +Overview +=========== + +About this guide +---------------- + +This guide describes the basics of the PCI Express Advanced Error +Reporting (AER) driver and provides information on how to use it, as +well as how to enable the drivers of endpoint devices to conform with +PCI Express AER driver. + + +What is the PCI Express AER Driver? +----------------------------------- + +PCI Express error signaling can occur on the PCI Express link itself +or on behalf of transactions initiated on the link. PCI Express +defines two error reporting paradigms: the baseline capability and +the Advanced Error Reporting capability. The baseline capability is +required of all PCI Express components providing a minimum defined +set of error reporting requirements. Advanced Error Reporting +capability is implemented with a PCI Express advanced error reporting +extended capability structure providing more robust error reporting. + +The PCI Express AER driver provides the infrastructure to support PCI +Express Advanced Error Reporting capability. The PCI Express AER +driver provides three basic functions: + + - Gathers the comprehensive error information if errors occurred. + - Reports error to the users. + - Performs error recovery actions. + +AER driver only attaches root ports which support PCI-Express AER +capability. + + +User Guide +========== + +Include the PCI Express AER Root Driver into the Linux Kernel +------------------------------------------------------------- + +The PCI Express AER Root driver is a Root Port service driver attached +to the PCI Express Port Bus driver. If a user wants to use it, the driver +has to be compiled. Option CONFIG_PCIEAER supports this capability. It +depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and +CONFIG_PCIEAER = y. + +Load PCI Express AER Root Driver +-------------------------------- + +Some systems have AER support in firmware. Enabling Linux AER support at +the same time the firmware handles AER may result in unpredictable +behavior. Therefore, Linux does not handle AER events unless the firmware +grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0 +Specification for details regarding _OSC usage. + +AER error output +---------------- + +When a PCIe AER error is captured, an error message will be output to +console. If it's a correctable error, it is output as a warning. +Otherwise, it is printed as an error. So users could choose different +log level to filter out correctable error messages. + +Below shows an example:: + + 0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID) + 0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000 + 0000:50:00.0: [20] Unsupported Request (First) + 0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100 + +In the example, 'Requester ID' means the ID of the device who sends +the error message to root port. Pls. refer to pci express specs for +other fields. + +AER Statistics / Counters +------------------------- + +When PCIe AER errors are captured, the counters / statistics are also exposed +in the form of sysfs attributes which are documented at +Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats + +Developer Guide +=============== + +To enable AER aware support requires a software driver to configure +the AER capability structure within its device and to provide callbacks. + +To support AER better, developers need understand how AER does work +firstly. + +PCI Express errors are classified into two types: correctable errors +and uncorrectable errors. This classification is based on the impacts +of those errors, which may result in degraded performance or function +failure. + +Correctable errors pose no impacts on the functionality of the +interface. The PCI Express protocol can recover without any software +intervention or any loss of data. These errors are detected and +corrected by hardware. Unlike correctable errors, uncorrectable +errors impact functionality of the interface. Uncorrectable errors +can cause a particular transaction or a particular PCI Express link +to be unreliable. Depending on those error conditions, uncorrectable +errors are further classified into non-fatal errors and fatal errors. +Non-fatal errors cause the particular transaction to be unreliable, +but the PCI Express link itself is fully functional. Fatal errors, on +the other hand, cause the link to be unreliable. + +When AER is enabled, a PCI Express device will automatically send an +error message to the PCIe root port above it when the device captures +an error. The Root Port, upon receiving an error reporting message, +internally processes and logs the error message in its PCI Express +capability structure. Error information being logged includes storing +the error reporting agent's requestor ID into the Error Source +Identification Registers and setting the error bits of the Root Error +Status Register accordingly. If AER error reporting is enabled in Root +Error Command Register, the Root Port generates an interrupt if an +error is detected. + +Note that the errors as described above are related to the PCI Express +hierarchy and links. These errors do not include any device specific +errors because device specific errors will still get sent directly to +the device driver. + +Configure the AER capability structure +-------------------------------------- + +AER aware drivers of PCI Express component need change the device +control registers to enable AER. They also could change AER registers, +including mask and severity registers. Helper function +pci_enable_pcie_error_reporting could be used to enable AER. See +section 3.3. + +Provide callbacks +----------------- + +callback reset_link to reset pci express link +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This callback is used to reset the pci express physical link when a +fatal error happens. The root port aer service driver provides a +default reset_link function, but different upstream ports might +have different specifications to reset pci express link, so all +upstream ports should provide their own reset_link functions. + +In struct pcie_port_service_driver, a new pointer, reset_link, is +added. +:: + + pci_ers_result_t (*reset_link) (struct pci_dev *dev); + +Section 3.2.2.2 provides more detailed info on when to call +reset_link. + +PCI error-recovery callbacks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The PCI Express AER Root driver uses error callbacks to coordinate +with downstream device drivers associated with a hierarchy in question +when performing error recovery actions. + +Data struct pci_driver has a pointer, err_handler, to point to +pci_error_handlers who consists of a couple of callback function +pointers. AER driver follows the rules defined in +pci-error-recovery.txt except pci express specific parts (e.g. +reset_link). Pls. refer to pci-error-recovery.txt for detailed +definitions of the callbacks. + +Below sections specify when to call the error callback functions. + +Correctable errors +~~~~~~~~~~~~~~~~~~ + +Correctable errors pose no impacts on the functionality of +the interface. The PCI Express protocol can recover without any +software intervention or any loss of data. These errors do not +require any recovery actions. The AER driver clears the device's +correctable error status register accordingly and logs these errors. + +Non-correctable (non-fatal and fatal) errors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If an error message indicates a non-fatal error, performing link reset +at upstream is not required. The AER driver calls error_detected(dev, +pci_channel_io_normal) to all drivers associated within a hierarchy in +question. for example:: + + EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort + +If Upstream port A captures an AER error, the hierarchy consists of +Downstream port B and EndPoint. + +A driver may return PCI_ERS_RESULT_CAN_RECOVER, +PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on +whether it can recover or the AER driver calls mmio_enabled as next. + +If an error message indicates a fatal error, kernel will broadcast +error_detected(dev, pci_channel_io_frozen) to all drivers within +a hierarchy in question. Then, performing link reset at upstream is +necessary. As different kinds of devices might use different approaches +to reset link, AER port service driver is required to provide the +function to reset link. Firstly, kernel looks for if the upstream +component has an aer driver. If it has, kernel uses the reset_link +callback of the aer driver. If the upstream component has no aer driver +and the port is downstream port, we will perform a hot reset as the +default by setting the Secondary Bus Reset bit of the Bridge Control +register associated with the downstream port. As for upstream ports, +they should provide their own aer service drivers with reset_link +function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and +reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes +to mmio_enabled. + +helper functions +---------------- +:: + + int pci_enable_pcie_error_reporting(struct pci_dev *dev); + +pci_enable_pcie_error_reporting enables the device to send error +messages to root port when an error is detected. Note that devices +don't enable the error reporting by default, so device drivers need +call this function to enable it. + +:: + + int pci_disable_pcie_error_reporting(struct pci_dev *dev); + +pci_disable_pcie_error_reporting disables the device to send error +messages to root port when an error is detected. + +:: + + int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);` + +pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable +error status register. + +Frequent Asked Questions +------------------------ + +Q: + What happens if a PCI Express device driver does not provide an + error recovery handler (pci_driver->err_handler is equal to NULL)? + +A: + The devices attached with the driver won't be recovered. If the + error is fatal, kernel will print out warning messages. Please refer + to section 3 for more information. + +Q: + What happens if an upstream port service driver does not provide + callback reset_link? + +A: + Fatal error recovery will fail if the errors are reported by the + upstream ports who are attached by the service driver. + +Q: + How does this infrastructure deal with driver that is not PCI + Express aware? + +A: + This infrastructure calls the error callback functions of the + driver when an error happens. But if the driver is not aware of + PCI Express, the device might not report its own errors to root + port. + +Q: + What modifications will that driver need to make it compatible + with the PCI Express AER Root driver? + +A: + It could call the helper functions to enable AER in devices and + cleanup uncorrectable status register. Pls. refer to section 3.3. + + +Software error injection +======================== + +Debugging PCIe AER error recovery code is quite difficult because it +is hard to trigger real hardware errors. Software based error +injection can be used to fake various kinds of PCIe errors. + +First you should enable PCIe AER software error injection in kernel +configuration, that is, following item should be in your .config. + +CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m + +After reboot with new kernel or insert the module, a device file named +/dev/aer_inject should be created. + +Then, you need a user space tool named aer-inject, which can be gotten +from: + + https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/ + +More information about aer-inject can be found in the document comes +with its source code. diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt deleted file mode 100644 index 48ce7903e3c6..000000000000 --- a/Documentation/PCI/pcieaer-howto.txt +++ /dev/null @@ -1,267 +0,0 @@ - The PCI Express Advanced Error Reporting Driver Guide HOWTO - T. Long Nguyen - Yanmin Zhang - 07/29/2006 - - -1. Overview - -1.1 About this guide - -This guide describes the basics of the PCI Express Advanced Error -Reporting (AER) driver and provides information on how to use it, as -well as how to enable the drivers of endpoint devices to conform with -PCI Express AER driver. - -1.2 Copyright (C) Intel Corporation 2006. - -1.3 What is the PCI Express AER Driver? - -PCI Express error signaling can occur on the PCI Express link itself -or on behalf of transactions initiated on the link. PCI Express -defines two error reporting paradigms: the baseline capability and -the Advanced Error Reporting capability. The baseline capability is -required of all PCI Express components providing a minimum defined -set of error reporting requirements. Advanced Error Reporting -capability is implemented with a PCI Express advanced error reporting -extended capability structure providing more robust error reporting. - -The PCI Express AER driver provides the infrastructure to support PCI -Express Advanced Error Reporting capability. The PCI Express AER -driver provides three basic functions: - -- Gathers the comprehensive error information if errors occurred. -- Reports error to the users. -- Performs error recovery actions. - -AER driver only attaches root ports which support PCI-Express AER -capability. - - -2. User Guide - -2.1 Include the PCI Express AER Root Driver into the Linux Kernel - -The PCI Express AER Root driver is a Root Port service driver attached -to the PCI Express Port Bus driver. If a user wants to use it, the driver -has to be compiled. Option CONFIG_PCIEAER supports this capability. It -depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and -CONFIG_PCIEAER = y. - -2.2 Load PCI Express AER Root Driver - -Some systems have AER support in firmware. Enabling Linux AER support at -the same time the firmware handles AER may result in unpredictable -behavior. Therefore, Linux does not handle AER events unless the firmware -grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0 -Specification for details regarding _OSC usage. - -2.3 AER error output - -When a PCIe AER error is captured, an error message will be output to -console. If it's a correctable error, it is output as a warning. -Otherwise, it is printed as an error. So users could choose different -log level to filter out correctable error messages. - -Below shows an example: -0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID) -0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000 -0000:50:00.0: [20] Unsupported Request (First) -0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100 - -In the example, 'Requester ID' means the ID of the device who sends -the error message to root port. Pls. refer to pci express specs for -other fields. - -2.4 AER Statistics / Counters - -When PCIe AER errors are captured, the counters / statistics are also exposed -in the form of sysfs attributes which are documented at -Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats - -3. Developer Guide - -To enable AER aware support requires a software driver to configure -the AER capability structure within its device and to provide callbacks. - -To support AER better, developers need understand how AER does work -firstly. - -PCI Express errors are classified into two types: correctable errors -and uncorrectable errors. This classification is based on the impacts -of those errors, which may result in degraded performance or function -failure. - -Correctable errors pose no impacts on the functionality of the -interface. The PCI Express protocol can recover without any software -intervention or any loss of data. These errors are detected and -corrected by hardware. Unlike correctable errors, uncorrectable -errors impact functionality of the interface. Uncorrectable errors -can cause a particular transaction or a particular PCI Express link -to be unreliable. Depending on those error conditions, uncorrectable -errors are further classified into non-fatal errors and fatal errors. -Non-fatal errors cause the particular transaction to be unreliable, -but the PCI Express link itself is fully functional. Fatal errors, on -the other hand, cause the link to be unreliable. - -When AER is enabled, a PCI Express device will automatically send an -error message to the PCIe root port above it when the device captures -an error. The Root Port, upon receiving an error reporting message, -internally processes and logs the error message in its PCI Express -capability structure. Error information being logged includes storing -the error reporting agent's requestor ID into the Error Source -Identification Registers and setting the error bits of the Root Error -Status Register accordingly. If AER error reporting is enabled in Root -Error Command Register, the Root Port generates an interrupt if an -error is detected. - -Note that the errors as described above are related to the PCI Express -hierarchy and links. These errors do not include any device specific -errors because device specific errors will still get sent directly to -the device driver. - -3.1 Configure the AER capability structure - -AER aware drivers of PCI Express component need change the device -control registers to enable AER. They also could change AER registers, -including mask and severity registers. Helper function -pci_enable_pcie_error_reporting could be used to enable AER. See -section 3.3. - -3.2. Provide callbacks - -3.2.1 callback reset_link to reset pci express link - -This callback is used to reset the pci express physical link when a -fatal error happens. The root port aer service driver provides a -default reset_link function, but different upstream ports might -have different specifications to reset pci express link, so all -upstream ports should provide their own reset_link functions. - -In struct pcie_port_service_driver, a new pointer, reset_link, is -added. - -pci_ers_result_t (*reset_link) (struct pci_dev *dev); - -Section 3.2.2.2 provides more detailed info on when to call -reset_link. - -3.2.2 PCI error-recovery callbacks - -The PCI Express AER Root driver uses error callbacks to coordinate -with downstream device drivers associated with a hierarchy in question -when performing error recovery actions. - -Data struct pci_driver has a pointer, err_handler, to point to -pci_error_handlers who consists of a couple of callback function -pointers. AER driver follows the rules defined in -pci-error-recovery.txt except pci express specific parts (e.g. -reset_link). Pls. refer to pci-error-recovery.txt for detailed -definitions of the callbacks. - -Below sections specify when to call the error callback functions. - -3.2.2.1 Correctable errors - -Correctable errors pose no impacts on the functionality of -the interface. The PCI Express protocol can recover without any -software intervention or any loss of data. These errors do not -require any recovery actions. The AER driver clears the device's -correctable error status register accordingly and logs these errors. - -3.2.2.2 Non-correctable (non-fatal and fatal) errors - -If an error message indicates a non-fatal error, performing link reset -at upstream is not required. The AER driver calls error_detected(dev, -pci_channel_io_normal) to all drivers associated within a hierarchy in -question. for example, -EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort. -If Upstream port A captures an AER error, the hierarchy consists of -Downstream port B and EndPoint. - -A driver may return PCI_ERS_RESULT_CAN_RECOVER, -PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on -whether it can recover or the AER driver calls mmio_enabled as next. - -If an error message indicates a fatal error, kernel will broadcast -error_detected(dev, pci_channel_io_frozen) to all drivers within -a hierarchy in question. Then, performing link reset at upstream is -necessary. As different kinds of devices might use different approaches -to reset link, AER port service driver is required to provide the -function to reset link. Firstly, kernel looks for if the upstream -component has an aer driver. If it has, kernel uses the reset_link -callback of the aer driver. If the upstream component has no aer driver -and the port is downstream port, we will perform a hot reset as the -default by setting the Secondary Bus Reset bit of the Bridge Control -register associated with the downstream port. As for upstream ports, -they should provide their own aer service drivers with reset_link -function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and -reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes -to mmio_enabled. - -3.3 helper functions - -3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev); -pci_enable_pcie_error_reporting enables the device to send error -messages to root port when an error is detected. Note that devices -don't enable the error reporting by default, so device drivers need -call this function to enable it. - -3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev); -pci_disable_pcie_error_reporting disables the device to send error -messages to root port when an error is detected. - -3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); -pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable -error status register. - -3.4 Frequent Asked Questions - -Q: What happens if a PCI Express device driver does not provide an -error recovery handler (pci_driver->err_handler is equal to NULL)? - -A: The devices attached with the driver won't be recovered. If the -error is fatal, kernel will print out warning messages. Please refer -to section 3 for more information. - -Q: What happens if an upstream port service driver does not provide -callback reset_link? - -A: Fatal error recovery will fail if the errors are reported by the -upstream ports who are attached by the service driver. - -Q: How does this infrastructure deal with driver that is not PCI -Express aware? - -A: This infrastructure calls the error callback functions of the -driver when an error happens. But if the driver is not aware of -PCI Express, the device might not report its own errors to root -port. - -Q: What modifications will that driver need to make it compatible -with the PCI Express AER Root driver? - -A: It could call the helper functions to enable AER in devices and -cleanup uncorrectable status register. Pls. refer to section 3.3. - - -4. Software error injection - -Debugging PCIe AER error recovery code is quite difficult because it -is hard to trigger real hardware errors. Software based error -injection can be used to fake various kinds of PCIe errors. - -First you should enable PCIe AER software error injection in kernel -configuration, that is, following item should be in your .config. - -CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m - -After reboot with new kernel or insert the module, a device file named -/dev/aer_inject should be created. - -Then, you need a user space tool named aer-inject, which can be gotten -from: - https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/ - -More information about aer-inject can be found in the document comes -with its source code. -- cgit From d8946fc38517755a2d9535a04c3f6a8d3a331eee Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:31 +0800 Subject: Documentation: PCI: convert endpoint/pci-endpoint.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/endpoint/index.rst | 10 ++ Documentation/PCI/endpoint/pci-endpoint.rst | 231 ++++++++++++++++++++++++++++ Documentation/PCI/endpoint/pci-endpoint.txt | 215 -------------------------- Documentation/PCI/index.rst | 1 + 4 files changed, 242 insertions(+), 215 deletions(-) create mode 100644 Documentation/PCI/endpoint/index.rst create mode 100644 Documentation/PCI/endpoint/pci-endpoint.rst delete mode 100644 Documentation/PCI/endpoint/pci-endpoint.txt (limited to 'Documentation') diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst new file mode 100644 index 000000000000..0db4f2fcd7f0 --- /dev/null +++ b/Documentation/PCI/endpoint/index.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +PCI Endpoint Framework +====================== + +.. toctree:: + :maxdepth: 2 + + pci-endpoint diff --git a/Documentation/PCI/endpoint/pci-endpoint.rst b/Documentation/PCI/endpoint/pci-endpoint.rst new file mode 100644 index 000000000000..0e2311b5617b --- /dev/null +++ b/Documentation/PCI/endpoint/pci-endpoint.rst @@ -0,0 +1,231 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Author: Kishon Vijay Abraham I + +This document is a guide to use the PCI Endpoint Framework in order to create +endpoint controller driver, endpoint function driver, and using configfs +interface to bind the function driver to the controller driver. + +Introduction +============ + +Linux has a comprehensive PCI subsystem to support PCI controllers that +operates in Root Complex mode. The subsystem has capability to scan PCI bus, +assign memory resources and IRQ resources, load PCI driver (based on +vendor ID, device ID), support other services like hot-plug, power management, +advanced error reporting and virtual channels. + +However the PCI controller IP integrated in some SoCs is capable of operating +either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will +add endpoint mode support in Linux. This will help to run Linux in an +EP system which can have a wide variety of use cases from testing or +validation, co-processor accelerator, etc. + +PCI Endpoint Core +================= + +The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller +library, the Endpoint Function library, and the configfs layer to bind the +endpoint function with the endpoint controller. + +PCI Endpoint Controller(EPC) Library +------------------------------------ + +The EPC library provides APIs to be used by the controller that can operate +in endpoint mode. It also provides APIs to be used by function driver/library +in order to implement a particular endpoint function. + +APIs for the PCI controller Driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI controller driver. + +* devm_pci_epc_create()/pci_epc_create() + + The PCI controller driver should implement the following ops: + + * write_header: ops to populate configuration space header + * set_bar: ops to configure the BAR + * clear_bar: ops to reset the BAR + * alloc_addr_space: ops to allocate in PCI controller address space + * free_addr_space: ops to free the allocated address space + * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt + * start: ops to start the PCI link + * stop: ops to stop the PCI link + + The PCI controller driver can then create a new EPC device by invoking + devm_pci_epc_create()/pci_epc_create(). + +* devm_pci_epc_destroy()/pci_epc_destroy() + + The PCI controller driver can destroy the EPC device created by either + devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or + pci_epc_destroy(). + +* pci_epc_linkup() + + In order to notify all the function devices that the EPC device to which + they are linked has established a link with the host, the PCI controller + driver should invoke pci_epc_linkup(). + +* pci_epc_mem_init() + + Initialize the pci_epc_mem structure used for allocating EPC addr space. + +* pci_epc_mem_exit() + + Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init(). + + +APIs for the PCI Endpoint Function Driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint function driver. + +* pci_epc_write_header() + + The PCI endpoint function driver should use pci_epc_write_header() to + write the standard configuration header to the endpoint controller. + +* pci_epc_set_bar() + + The PCI endpoint function driver should use pci_epc_set_bar() to configure + the Base Address Register in order for the host to assign PCI addr space. + Register space of the function driver is usually configured + using this API. + +* pci_epc_clear_bar() + + The PCI endpoint function driver should use pci_epc_clear_bar() to reset + the BAR. + +* pci_epc_raise_irq() + + The PCI endpoint function driver should use pci_epc_raise_irq() to raise + Legacy Interrupt, MSI or MSI-X Interrupt. + +* pci_epc_mem_alloc_addr() + + The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to + allocate memory address from EPC addr space which is required to access + RC's buffer + +* pci_epc_mem_free_addr() + + The PCI endpoint function driver should use pci_epc_mem_free_addr() to + free the memory space allocated using pci_epc_mem_alloc_addr(). + +Other APIs +~~~~~~~~~~ + +There are other APIs provided by the EPC library. These are used for binding +the EPF device with EPC device. pci-ep-cfs.c can be used as reference for +using these APIs. + +* pci_epc_get() + + Get a reference to the PCI endpoint controller based on the device name of + the controller. + +* pci_epc_put() + + Release the reference to the PCI endpoint controller obtained using + pci_epc_get() + +* pci_epc_add_epf() + + Add a PCI endpoint function to a PCI endpoint controller. A PCIe device + can have up to 8 functions according to the specification. + +* pci_epc_remove_epf() + + Remove the PCI endpoint function from PCI endpoint controller. + +* pci_epc_start() + + The PCI endpoint function driver should invoke pci_epc_start() once it + has configured the endpoint function and wants to start the PCI link. + +* pci_epc_stop() + + The PCI endpoint function driver should invoke pci_epc_stop() to stop + the PCI LINK. + + +PCI Endpoint Function(EPF) Library +---------------------------------- + +The EPF library provides APIs to be used by the function driver and the EPC +library to provide endpoint mode functionality. + +APIs for the PCI Endpoint Function Driver +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint function driver. + +* pci_epf_register_driver() + + The PCI Endpoint Function driver should implement the following ops: + * bind: ops to perform when a EPC device has been bound to EPF device + * unbind: ops to perform when a binding has been lost between a EPC + device and EPF device + * linkup: ops to perform when the EPC device has established a + connection with a host system + + The PCI Function driver can then register the PCI EPF driver by using + pci_epf_register_driver(). + +* pci_epf_unregister_driver() + + The PCI Function driver can unregister the PCI EPF driver by using + pci_epf_unregister_driver(). + +* pci_epf_alloc_space() + + The PCI Function driver can allocate space for a particular BAR using + pci_epf_alloc_space(). + +* pci_epf_free_space() + + The PCI Function driver can free the allocated space + (using pci_epf_alloc_space) by invoking pci_epf_free_space(). + +APIs for the PCI Endpoint Controller Library +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint controller library. + +* pci_epf_linkup() + + The PCI endpoint controller library invokes pci_epf_linkup() when the + EPC device has established the connection to the host. + +Other APIs +~~~~~~~~~~ + +There are other APIs provided by the EPF library. These are used to notify +the function driver when the EPF device is bound to the EPC device. +pci-ep-cfs.c can be used as reference for using these APIs. + +* pci_epf_create() + + Create a new PCI EPF device by passing the name of the PCI EPF device. + This name will be used to bind the the EPF device to a EPF driver. + +* pci_epf_destroy() + + Destroy the created PCI EPF device. + +* pci_epf_bind() + + pci_epf_bind() should be invoked when the EPF device has been bound to + a EPC device. + +* pci_epf_unbind() + + pci_epf_unbind() should be invoked when the binding between EPC device + and EPF device is lost. diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt deleted file mode 100644 index e86a96b66a6a..000000000000 --- a/Documentation/PCI/endpoint/pci-endpoint.txt +++ /dev/null @@ -1,215 +0,0 @@ - PCI ENDPOINT FRAMEWORK - Kishon Vijay Abraham I - -This document is a guide to use the PCI Endpoint Framework in order to create -endpoint controller driver, endpoint function driver, and using configfs -interface to bind the function driver to the controller driver. - -1. Introduction - -Linux has a comprehensive PCI subsystem to support PCI controllers that -operates in Root Complex mode. The subsystem has capability to scan PCI bus, -assign memory resources and IRQ resources, load PCI driver (based on -vendor ID, device ID), support other services like hot-plug, power management, -advanced error reporting and virtual channels. - -However the PCI controller IP integrated in some SoCs is capable of operating -either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will -add endpoint mode support in Linux. This will help to run Linux in an -EP system which can have a wide variety of use cases from testing or -validation, co-processor accelerator, etc. - -2. PCI Endpoint Core - -The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller -library, the Endpoint Function library, and the configfs layer to bind the -endpoint function with the endpoint controller. - -2.1 PCI Endpoint Controller(EPC) Library - -The EPC library provides APIs to be used by the controller that can operate -in endpoint mode. It also provides APIs to be used by function driver/library -in order to implement a particular endpoint function. - -2.1.1 APIs for the PCI controller Driver - -This section lists the APIs that the PCI Endpoint core provides to be used -by the PCI controller driver. - -*) devm_pci_epc_create()/pci_epc_create() - - The PCI controller driver should implement the following ops: - * write_header: ops to populate configuration space header - * set_bar: ops to configure the BAR - * clear_bar: ops to reset the BAR - * alloc_addr_space: ops to allocate in PCI controller address space - * free_addr_space: ops to free the allocated address space - * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt - * start: ops to start the PCI link - * stop: ops to stop the PCI link - - The PCI controller driver can then create a new EPC device by invoking - devm_pci_epc_create()/pci_epc_create(). - -*) devm_pci_epc_destroy()/pci_epc_destroy() - - The PCI controller driver can destroy the EPC device created by either - devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or - pci_epc_destroy(). - -*) pci_epc_linkup() - - In order to notify all the function devices that the EPC device to which - they are linked has established a link with the host, the PCI controller - driver should invoke pci_epc_linkup(). - -*) pci_epc_mem_init() - - Initialize the pci_epc_mem structure used for allocating EPC addr space. - -*) pci_epc_mem_exit() - - Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init(). - -2.1.2 APIs for the PCI Endpoint Function Driver - -This section lists the APIs that the PCI Endpoint core provides to be used -by the PCI endpoint function driver. - -*) pci_epc_write_header() - - The PCI endpoint function driver should use pci_epc_write_header() to - write the standard configuration header to the endpoint controller. - -*) pci_epc_set_bar() - - The PCI endpoint function driver should use pci_epc_set_bar() to configure - the Base Address Register in order for the host to assign PCI addr space. - Register space of the function driver is usually configured - using this API. - -*) pci_epc_clear_bar() - - The PCI endpoint function driver should use pci_epc_clear_bar() to reset - the BAR. - -*) pci_epc_raise_irq() - - The PCI endpoint function driver should use pci_epc_raise_irq() to raise - Legacy Interrupt, MSI or MSI-X Interrupt. - -*) pci_epc_mem_alloc_addr() - - The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to - allocate memory address from EPC addr space which is required to access - RC's buffer - -*) pci_epc_mem_free_addr() - - The PCI endpoint function driver should use pci_epc_mem_free_addr() to - free the memory space allocated using pci_epc_mem_alloc_addr(). - -2.1.3 Other APIs - -There are other APIs provided by the EPC library. These are used for binding -the EPF device with EPC device. pci-ep-cfs.c can be used as reference for -using these APIs. - -*) pci_epc_get() - - Get a reference to the PCI endpoint controller based on the device name of - the controller. - -*) pci_epc_put() - - Release the reference to the PCI endpoint controller obtained using - pci_epc_get() - -*) pci_epc_add_epf() - - Add a PCI endpoint function to a PCI endpoint controller. A PCIe device - can have up to 8 functions according to the specification. - -*) pci_epc_remove_epf() - - Remove the PCI endpoint function from PCI endpoint controller. - -*) pci_epc_start() - - The PCI endpoint function driver should invoke pci_epc_start() once it - has configured the endpoint function and wants to start the PCI link. - -*) pci_epc_stop() - - The PCI endpoint function driver should invoke pci_epc_stop() to stop - the PCI LINK. - -2.2 PCI Endpoint Function(EPF) Library - -The EPF library provides APIs to be used by the function driver and the EPC -library to provide endpoint mode functionality. - -2.2.1 APIs for the PCI Endpoint Function Driver - -This section lists the APIs that the PCI Endpoint core provides to be used -by the PCI endpoint function driver. - -*) pci_epf_register_driver() - - The PCI Endpoint Function driver should implement the following ops: - * bind: ops to perform when a EPC device has been bound to EPF device - * unbind: ops to perform when a binding has been lost between a EPC - device and EPF device - * linkup: ops to perform when the EPC device has established a - connection with a host system - - The PCI Function driver can then register the PCI EPF driver by using - pci_epf_register_driver(). - -*) pci_epf_unregister_driver() - - The PCI Function driver can unregister the PCI EPF driver by using - pci_epf_unregister_driver(). - -*) pci_epf_alloc_space() - - The PCI Function driver can allocate space for a particular BAR using - pci_epf_alloc_space(). - -*) pci_epf_free_space() - - The PCI Function driver can free the allocated space - (using pci_epf_alloc_space) by invoking pci_epf_free_space(). - -2.2.2 APIs for the PCI Endpoint Controller Library -This section lists the APIs that the PCI Endpoint core provides to be used -by the PCI endpoint controller library. - -*) pci_epf_linkup() - - The PCI endpoint controller library invokes pci_epf_linkup() when the - EPC device has established the connection to the host. - -2.2.2 Other APIs -There are other APIs provided by the EPF library. These are used to notify -the function driver when the EPF device is bound to the EPC device. -pci-ep-cfs.c can be used as reference for using these APIs. - -*) pci_epf_create() - - Create a new PCI EPF device by passing the name of the PCI EPF device. - This name will be used to bind the the EPF device to a EPF driver. - -*) pci_epf_destroy() - - Destroy the created PCI EPF device. - -*) pci_epf_bind() - - pci_epf_bind() should be invoked when the EPF device has been bound to - a EPC device. - -*) pci_epf_unbind() - - pci_epf_unbind() should be invoked when the binding between EPC device - and EPF device is lost. diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index f54b65b1ca5f..f4c6121868c3 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -15,3 +15,4 @@ Linux PCI Bus Subsystem acpi-info pci-error-recovery pcieaer-howto + endpoint/index -- cgit From d4518e4ac64cae18f953f8a433359ea1face4b52 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:32 +0800 Subject: Documentation: PCI: convert endpoint/pci-endpoint-cfs.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/endpoint/index.rst | 1 + Documentation/PCI/endpoint/pci-endpoint-cfs.rst | 118 ++++++++++++++++++++++++ Documentation/PCI/endpoint/pci-endpoint-cfs.txt | 105 --------------------- 3 files changed, 119 insertions(+), 105 deletions(-) create mode 100644 Documentation/PCI/endpoint/pci-endpoint-cfs.rst delete mode 100644 Documentation/PCI/endpoint/pci-endpoint-cfs.txt (limited to 'Documentation') diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst index 0db4f2fcd7f0..3951de9f923c 100644 --- a/Documentation/PCI/endpoint/index.rst +++ b/Documentation/PCI/endpoint/index.rst @@ -8,3 +8,4 @@ PCI Endpoint Framework :maxdepth: 2 pci-endpoint + pci-endpoint-cfs diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.rst b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst new file mode 100644 index 000000000000..b6d39cdec56e --- /dev/null +++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst @@ -0,0 +1,118 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +Configuring PCI Endpoint Using CONFIGFS +======================================= + +:Author: Kishon Vijay Abraham I + +The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the +PCI endpoint function and to bind the endpoint function +with the endpoint controller. (For introducing other mechanisms to +configure the PCI Endpoint Function refer to [1]). + +Mounting configfs +================= + +The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs +directory. configfs can be mounted using the following command:: + + mount -t configfs none /sys/kernel/config + +Directory Structure +=================== + +The pci_ep configfs has two directories at its root: controllers and +functions. Every EPC device present in the system will have an entry in +the *controllers* directory and and every EPF driver present in the system +will have an entry in the *functions* directory. +:: + + /sys/kernel/config/pci_ep/ + .. controllers/ + .. functions/ + +Creating EPF Device +=================== + +Every registered EPF driver will be listed in controllers directory. The +entries corresponding to EPF driver will be created by the EPF core. +:: + + /sys/kernel/config/pci_ep/functions/ + .. / + ... / + ... / + .. / + ... / + ... / + +In order to create a of the type probed by , the +user has to create a directory inside . + +Every directory consists of the following entries that can be +used to configure the standard configuration header of the endpoint function. +(These entries are created by the framework when any new is +created) +:: + + .. / + ... / + ... vendorid + ... deviceid + ... revid + ... progif_code + ... subclass_code + ... baseclass_code + ... cache_line_size + ... subsys_vendor_id + ... subsys_id + ... interrupt_pin + +EPC Device +========== + +Every registered EPC device will be listed in controllers directory. The +entries corresponding to EPC device will be created by the EPC core. +:: + + /sys/kernel/config/pci_ep/controllers/ + .. / + ... / + ... / + ... start + .. / + ... / + ... / + ... start + +The directory will have a list of symbolic links to +. These symbolic links should be created by the user to +represent the functions present in the endpoint device. + +The directory will also have a *start* field. Once +"1" is written to this field, the endpoint device will be ready to +establish the link with the host. This is usually done after +all the EPF devices are created and linked with the EPC device. +:: + + | controllers/ + | / + | + | start + | functions/ + | / + | / + | vendorid + | deviceid + | revid + | progif_code + | subclass_code + | baseclass_code + | cache_line_size + | subsys_vendor_id + | subsys_id + | interrupt_pin + | function + +[1] :doc:`pci-endpoint` diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt deleted file mode 100644 index d740f29960a4..000000000000 --- a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt +++ /dev/null @@ -1,105 +0,0 @@ - CONFIGURING PCI ENDPOINT USING CONFIGFS - Kishon Vijay Abraham I - -The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the -PCI endpoint function and to bind the endpoint function -with the endpoint controller. (For introducing other mechanisms to -configure the PCI Endpoint Function refer to [1]). - -*) Mounting configfs - -The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs -directory. configfs can be mounted using the following command. - - mount -t configfs none /sys/kernel/config - -*) Directory Structure - -The pci_ep configfs has two directories at its root: controllers and -functions. Every EPC device present in the system will have an entry in -the *controllers* directory and and every EPF driver present in the system -will have an entry in the *functions* directory. - -/sys/kernel/config/pci_ep/ - .. controllers/ - .. functions/ - -*) Creating EPF Device - -Every registered EPF driver will be listed in controllers directory. The -entries corresponding to EPF driver will be created by the EPF core. - -/sys/kernel/config/pci_ep/functions/ - .. / - ... / - ... / - .. / - ... / - ... / - -In order to create a of the type probed by , the -user has to create a directory inside . - -Every directory consists of the following entries that can be -used to configure the standard configuration header of the endpoint function. -(These entries are created by the framework when any new is -created) - - .. / - ... / - ... vendorid - ... deviceid - ... revid - ... progif_code - ... subclass_code - ... baseclass_code - ... cache_line_size - ... subsys_vendor_id - ... subsys_id - ... interrupt_pin - -*) EPC Device - -Every registered EPC device will be listed in controllers directory. The -entries corresponding to EPC device will be created by the EPC core. - -/sys/kernel/config/pci_ep/controllers/ - .. / - ... / - ... / - ... start - .. / - ... / - ... / - ... start - -The directory will have a list of symbolic links to -. These symbolic links should be created by the user to -represent the functions present in the endpoint device. - -The directory will also have a *start* field. Once -"1" is written to this field, the endpoint device will be ready to -establish the link with the host. This is usually done after -all the EPF devices are created and linked with the EPC device. - - - | controllers/ - | / - | - | start - | functions/ - | / - | / - | vendorid - | deviceid - | revid - | progif_code - | subclass_code - | baseclass_code - | cache_line_size - | subsys_vendor_id - | subsys_id - | interrupt_pin - | function - -[1] -> Documentation/PCI/endpoint/pci-endpoint.txt -- cgit From bf2c2658d4b6baed13c274da7091428772b5cb03 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:33 +0800 Subject: Documentation: PCI: convert endpoint/pci-test-function.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/endpoint/index.rst | 1 + Documentation/PCI/endpoint/pci-test-function.rst | 103 +++++++++++++++++++++++ Documentation/PCI/endpoint/pci-test-function.txt | 87 ------------------- 3 files changed, 104 insertions(+), 87 deletions(-) create mode 100644 Documentation/PCI/endpoint/pci-test-function.rst delete mode 100644 Documentation/PCI/endpoint/pci-test-function.txt (limited to 'Documentation') diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst index 3951de9f923c..b680a3fc4fec 100644 --- a/Documentation/PCI/endpoint/index.rst +++ b/Documentation/PCI/endpoint/index.rst @@ -9,3 +9,4 @@ PCI Endpoint Framework pci-endpoint pci-endpoint-cfs + pci-test-function diff --git a/Documentation/PCI/endpoint/pci-test-function.rst b/Documentation/PCI/endpoint/pci-test-function.rst new file mode 100644 index 000000000000..3c8521d7aa31 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-test-function.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +PCI Test Function +================= + +:Author: Kishon Vijay Abraham I + +Traditionally PCI RC has always been validated by using standard +PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards. +However with the addition of EP-core in linux kernel, it is possible +to configure a PCI controller that can operate in EP mode to work as +a test device. + +The PCI endpoint test device is a virtual device (defined in software) +used to test the endpoint functionality and serve as a sample driver +for other PCI endpoint devices (to use the EP framework). + +The PCI endpoint test device has the following registers: + + 1) PCI_ENDPOINT_TEST_MAGIC + 2) PCI_ENDPOINT_TEST_COMMAND + 3) PCI_ENDPOINT_TEST_STATUS + 4) PCI_ENDPOINT_TEST_SRC_ADDR + 5) PCI_ENDPOINT_TEST_DST_ADDR + 6) PCI_ENDPOINT_TEST_SIZE + 7) PCI_ENDPOINT_TEST_CHECKSUM + 8) PCI_ENDPOINT_TEST_IRQ_TYPE + 9) PCI_ENDPOINT_TEST_IRQ_NUMBER + +* PCI_ENDPOINT_TEST_MAGIC + +This register will be used to test BAR0. A known pattern will be written +and read back from MAGIC register to verify BAR0. + +* PCI_ENDPOINT_TEST_COMMAND + +This register will be used by the host driver to indicate the function +that the endpoint device must perform. + +======== ================================================================ +Bitfield Description +======== ================================================================ +Bit 0 raise legacy IRQ +Bit 1 raise MSI IRQ +Bit 2 raise MSI-X IRQ +Bit 3 read command (read data from RC buffer) +Bit 4 write command (write data to RC buffer) +Bit 5 copy command (copy data from one RC buffer to another RC buffer) +======== ================================================================ + +* PCI_ENDPOINT_TEST_STATUS + +This register reflects the status of the PCI endpoint device. + +======== ============================== +Bitfield Description +======== ============================== +Bit 0 read success +Bit 1 read fail +Bit 2 write success +Bit 3 write fail +Bit 4 copy success +Bit 5 copy fail +Bit 6 IRQ raised +Bit 7 source address is invalid +Bit 8 destination address is invalid +======== ============================== + +* PCI_ENDPOINT_TEST_SRC_ADDR + +This register contains the source address (RC buffer address) for the +COPY/READ command. + +* PCI_ENDPOINT_TEST_DST_ADDR + +This register contains the destination address (RC buffer address) for +the COPY/WRITE command. + +* PCI_ENDPOINT_TEST_IRQ_TYPE + +This register contains the interrupt type (Legacy/MSI) triggered +for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands. + +Possible types: + +====== == +Legacy 0 +MSI 1 +MSI-X 2 +====== == + +* PCI_ENDPOINT_TEST_IRQ_NUMBER + +This register contains the triggered ID interrupt. + +Admissible values: + +====== =========== +Legacy 0 +MSI [1 .. 32] +MSI-X [1 .. 2048] +====== =========== diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt deleted file mode 100644 index 5916f1f592bb..000000000000 --- a/Documentation/PCI/endpoint/pci-test-function.txt +++ /dev/null @@ -1,87 +0,0 @@ - PCI TEST - Kishon Vijay Abraham I - -Traditionally PCI RC has always been validated by using standard -PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards. -However with the addition of EP-core in linux kernel, it is possible -to configure a PCI controller that can operate in EP mode to work as -a test device. - -The PCI endpoint test device is a virtual device (defined in software) -used to test the endpoint functionality and serve as a sample driver -for other PCI endpoint devices (to use the EP framework). - -The PCI endpoint test device has the following registers: - - 1) PCI_ENDPOINT_TEST_MAGIC - 2) PCI_ENDPOINT_TEST_COMMAND - 3) PCI_ENDPOINT_TEST_STATUS - 4) PCI_ENDPOINT_TEST_SRC_ADDR - 5) PCI_ENDPOINT_TEST_DST_ADDR - 6) PCI_ENDPOINT_TEST_SIZE - 7) PCI_ENDPOINT_TEST_CHECKSUM - 8) PCI_ENDPOINT_TEST_IRQ_TYPE - 9) PCI_ENDPOINT_TEST_IRQ_NUMBER - -*) PCI_ENDPOINT_TEST_MAGIC - -This register will be used to test BAR0. A known pattern will be written -and read back from MAGIC register to verify BAR0. - -*) PCI_ENDPOINT_TEST_COMMAND: - -This register will be used by the host driver to indicate the function -that the endpoint device must perform. - -Bitfield Description: - Bit 0 : raise legacy IRQ - Bit 1 : raise MSI IRQ - Bit 2 : raise MSI-X IRQ - Bit 3 : read command (read data from RC buffer) - Bit 4 : write command (write data to RC buffer) - Bit 5 : copy command (copy data from one RC buffer to another - RC buffer) - -*) PCI_ENDPOINT_TEST_STATUS - -This register reflects the status of the PCI endpoint device. - -Bitfield Description: - Bit 0 : read success - Bit 1 : read fail - Bit 2 : write success - Bit 3 : write fail - Bit 4 : copy success - Bit 5 : copy fail - Bit 6 : IRQ raised - Bit 7 : source address is invalid - Bit 8 : destination address is invalid - -*) PCI_ENDPOINT_TEST_SRC_ADDR - -This register contains the source address (RC buffer address) for the -COPY/READ command. - -*) PCI_ENDPOINT_TEST_DST_ADDR - -This register contains the destination address (RC buffer address) for -the COPY/WRITE command. - -*) PCI_ENDPOINT_TEST_IRQ_TYPE - -This register contains the interrupt type (Legacy/MSI) triggered -for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands. - -Possible types: - - Legacy : 0 - - MSI : 1 - - MSI-X : 2 - -*) PCI_ENDPOINT_TEST_IRQ_NUMBER - -This register contains the triggered ID interrupt. - -Admissible values: - - Legacy : 0 - - MSI : [1 .. 32] - - MSI-X : [1 .. 2048] -- cgit From 9595aee2a389be5dfa9a0121a14e8fba70f17278 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:34 +0800 Subject: Documentation: PCI: convert endpoint/pci-test-howto.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Signed-off-by: Changbin Du Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/endpoint/index.rst | 1 + Documentation/PCI/endpoint/pci-test-howto.rst | 235 ++++++++++++++++++++++++++ Documentation/PCI/endpoint/pci-test-howto.txt | 206 ---------------------- 3 files changed, 236 insertions(+), 206 deletions(-) create mode 100644 Documentation/PCI/endpoint/pci-test-howto.rst delete mode 100644 Documentation/PCI/endpoint/pci-test-howto.txt (limited to 'Documentation') diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst index b680a3fc4fec..d114ea74b444 100644 --- a/Documentation/PCI/endpoint/index.rst +++ b/Documentation/PCI/endpoint/index.rst @@ -10,3 +10,4 @@ PCI Endpoint Framework pci-endpoint pci-endpoint-cfs pci-test-function + pci-test-howto diff --git a/Documentation/PCI/endpoint/pci-test-howto.rst b/Documentation/PCI/endpoint/pci-test-howto.rst new file mode 100644 index 000000000000..909f770a07d6 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-test-howto.rst @@ -0,0 +1,235 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +PCI Test User Guide +=================== + +:Author: Kishon Vijay Abraham I + +This document is a guide to help users use pci-epf-test function driver +and pci_endpoint_test host driver for testing PCI. The list of steps to +be followed in the host side and EP side is given below. + +Endpoint Device +=============== + +Endpoint Controller Devices +--------------------------- + +To find the list of endpoint controller devices in the system:: + + # ls /sys/class/pci_epc/ + 51000000.pcie_ep + +If PCI_ENDPOINT_CONFIGFS is enabled:: + + # ls /sys/kernel/config/pci_ep/controllers + 51000000.pcie_ep + + +Endpoint Function Drivers +------------------------- + +To find the list of endpoint function drivers in the system:: + + # ls /sys/bus/pci-epf/drivers + pci_epf_test + +If PCI_ENDPOINT_CONFIGFS is enabled:: + + # ls /sys/kernel/config/pci_ep/functions + pci_epf_test + + +Creating pci-epf-test Device +---------------------------- + +PCI endpoint function device can be created using the configfs. To create +pci-epf-test device, the following commands can be used:: + + # mount -t configfs none /sys/kernel/config + # cd /sys/kernel/config/pci_ep/ + # mkdir functions/pci_epf_test/func1 + +The "mkdir func1" above creates the pci-epf-test function device that will +be probed by pci_epf_test driver. + +The PCI endpoint framework populates the directory with the following +configurable fields:: + + # ls functions/pci_epf_test/func1 + baseclass_code interrupt_pin progif_code subsys_id + cache_line_size msi_interrupts revid subsys_vendorid + deviceid msix_interrupts subclass_code vendorid + +The PCI endpoint function driver populates these entries with default values +when the device is bound to the driver. The pci-epf-test driver populates +vendorid with 0xffff and interrupt_pin with 0x0001:: + + # cat functions/pci_epf_test/func1/vendorid + 0xffff + # cat functions/pci_epf_test/func1/interrupt_pin + 0x0001 + + +Configuring pci-epf-test Device +------------------------------- + +The user can configure the pci-epf-test device using configfs entry. In order +to change the vendorid and the number of MSI interrupts used by the function +device, the following commands can be used:: + + # echo 0x104c > functions/pci_epf_test/func1/vendorid + # echo 0xb500 > functions/pci_epf_test/func1/deviceid + # echo 16 > functions/pci_epf_test/func1/msi_interrupts + # echo 8 > functions/pci_epf_test/func1/msix_interrupts + + +Binding pci-epf-test Device to EP Controller +-------------------------------------------- + +In order for the endpoint function device to be useful, it has to be bound to +a PCI endpoint controller driver. Use the configfs to bind the function +device to one of the controller driver present in the system:: + + # ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/ + +Once the above step is completed, the PCI endpoint is ready to establish a link +with the host. + + +Start the Link +-------------- + +In order for the endpoint device to establish a link with the host, the _start_ +field should be populated with '1':: + + # echo 1 > controllers/51000000.pcie_ep/start + + +RootComplex Device +================== + +lspci Output +------------ + +Note that the devices listed here correspond to the value populated in 1.4 +above:: + + 00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01) + 01:00.0 Unassigned class [ff00]: Texas Instruments Device b500 + + +Using Endpoint Test function Device +----------------------------------- + +pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint +tests. To compile this tool the following commands should be used:: + + # cd + # make -C tools/pci + +or if you desire to compile and install in your system:: + + # cd + # make -C tools/pci install + +The tool and script will be located in /usr/bin/ + + +pcitest.sh Output +~~~~~~~~~~~~~~~~~ +:: + + # pcitest.sh + BAR tests + + BAR0: OKAY + BAR1: OKAY + BAR2: OKAY + BAR3: OKAY + BAR4: NOT OKAY + BAR5: NOT OKAY + + Interrupt tests + + SET IRQ TYPE TO LEGACY: OKAY + LEGACY IRQ: NOT OKAY + SET IRQ TYPE TO MSI: OKAY + MSI1: OKAY + MSI2: OKAY + MSI3: OKAY + MSI4: OKAY + MSI5: OKAY + MSI6: OKAY + MSI7: OKAY + MSI8: OKAY + MSI9: OKAY + MSI10: OKAY + MSI11: OKAY + MSI12: OKAY + MSI13: OKAY + MSI14: OKAY + MSI15: OKAY + MSI16: OKAY + MSI17: NOT OKAY + MSI18: NOT OKAY + MSI19: NOT OKAY + MSI20: NOT OKAY + MSI21: NOT OKAY + MSI22: NOT OKAY + MSI23: NOT OKAY + MSI24: NOT OKAY + MSI25: NOT OKAY + MSI26: NOT OKAY + MSI27: NOT OKAY + MSI28: NOT OKAY + MSI29: NOT OKAY + MSI30: NOT OKAY + MSI31: NOT OKAY + MSI32: NOT OKAY + SET IRQ TYPE TO MSI-X: OKAY + MSI-X1: OKAY + MSI-X2: OKAY + MSI-X3: OKAY + MSI-X4: OKAY + MSI-X5: OKAY + MSI-X6: OKAY + MSI-X7: OKAY + MSI-X8: OKAY + MSI-X9: NOT OKAY + MSI-X10: NOT OKAY + MSI-X11: NOT OKAY + MSI-X12: NOT OKAY + MSI-X13: NOT OKAY + MSI-X14: NOT OKAY + MSI-X15: NOT OKAY + MSI-X16: NOT OKAY + [...] + MSI-X2047: NOT OKAY + MSI-X2048: NOT OKAY + + Read Tests + + SET IRQ TYPE TO MSI: OKAY + READ ( 1 bytes): OKAY + READ ( 1024 bytes): OKAY + READ ( 1025 bytes): OKAY + READ (1024000 bytes): OKAY + READ (1024001 bytes): OKAY + + Write Tests + + WRITE ( 1 bytes): OKAY + WRITE ( 1024 bytes): OKAY + WRITE ( 1025 bytes): OKAY + WRITE (1024000 bytes): OKAY + WRITE (1024001 bytes): OKAY + + Copy Tests + + COPY ( 1 bytes): OKAY + COPY ( 1024 bytes): OKAY + COPY ( 1025 bytes): OKAY + COPY (1024000 bytes): OKAY + COPY (1024001 bytes): OKAY diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt deleted file mode 100644 index 040479f437a5..000000000000 --- a/Documentation/PCI/endpoint/pci-test-howto.txt +++ /dev/null @@ -1,206 +0,0 @@ - PCI TEST USERGUIDE - Kishon Vijay Abraham I - -This document is a guide to help users use pci-epf-test function driver -and pci_endpoint_test host driver for testing PCI. The list of steps to -be followed in the host side and EP side is given below. - -1. Endpoint Device - -1.1 Endpoint Controller Devices - -To find the list of endpoint controller devices in the system: - - # ls /sys/class/pci_epc/ - 51000000.pcie_ep - -If PCI_ENDPOINT_CONFIGFS is enabled - # ls /sys/kernel/config/pci_ep/controllers - 51000000.pcie_ep - -1.2 Endpoint Function Drivers - -To find the list of endpoint function drivers in the system: - - # ls /sys/bus/pci-epf/drivers - pci_epf_test - -If PCI_ENDPOINT_CONFIGFS is enabled - # ls /sys/kernel/config/pci_ep/functions - pci_epf_test - -1.3 Creating pci-epf-test Device - -PCI endpoint function device can be created using the configfs. To create -pci-epf-test device, the following commands can be used - - # mount -t configfs none /sys/kernel/config - # cd /sys/kernel/config/pci_ep/ - # mkdir functions/pci_epf_test/func1 - -The "mkdir func1" above creates the pci-epf-test function device that will -be probed by pci_epf_test driver. - -The PCI endpoint framework populates the directory with the following -configurable fields. - - # ls functions/pci_epf_test/func1 - baseclass_code interrupt_pin progif_code subsys_id - cache_line_size msi_interrupts revid subsys_vendorid - deviceid msix_interrupts subclass_code vendorid - -The PCI endpoint function driver populates these entries with default values -when the device is bound to the driver. The pci-epf-test driver populates -vendorid with 0xffff and interrupt_pin with 0x0001 - - # cat functions/pci_epf_test/func1/vendorid - 0xffff - # cat functions/pci_epf_test/func1/interrupt_pin - 0x0001 - -1.4 Configuring pci-epf-test Device - -The user can configure the pci-epf-test device using configfs entry. In order -to change the vendorid and the number of MSI interrupts used by the function -device, the following commands can be used. - - # echo 0x104c > functions/pci_epf_test/func1/vendorid - # echo 0xb500 > functions/pci_epf_test/func1/deviceid - # echo 16 > functions/pci_epf_test/func1/msi_interrupts - # echo 8 > functions/pci_epf_test/func1/msix_interrupts - -1.5 Binding pci-epf-test Device to EP Controller - -In order for the endpoint function device to be useful, it has to be bound to -a PCI endpoint controller driver. Use the configfs to bind the function -device to one of the controller driver present in the system. - - # ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/ - -Once the above step is completed, the PCI endpoint is ready to establish a link -with the host. - -1.6 Start the Link - -In order for the endpoint device to establish a link with the host, the _start_ -field should be populated with '1'. - - # echo 1 > controllers/51000000.pcie_ep/start - -2. RootComplex Device - -2.1 lspci Output - -Note that the devices listed here correspond to the value populated in 1.4 above - - 00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01) - 01:00.0 Unassigned class [ff00]: Texas Instruments Device b500 - -2.2 Using Endpoint Test function Device - -pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint -tests. To compile this tool the following commands should be used: - - # cd - # make -C tools/pci - -or if you desire to compile and install in your system: - - # cd - # make -C tools/pci install - -The tool and script will be located in /usr/bin/ - -2.2.1 pcitest.sh Output - # pcitest.sh - BAR tests - - BAR0: OKAY - BAR1: OKAY - BAR2: OKAY - BAR3: OKAY - BAR4: NOT OKAY - BAR5: NOT OKAY - - Interrupt tests - - SET IRQ TYPE TO LEGACY: OKAY - LEGACY IRQ: NOT OKAY - SET IRQ TYPE TO MSI: OKAY - MSI1: OKAY - MSI2: OKAY - MSI3: OKAY - MSI4: OKAY - MSI5: OKAY - MSI6: OKAY - MSI7: OKAY - MSI8: OKAY - MSI9: OKAY - MSI10: OKAY - MSI11: OKAY - MSI12: OKAY - MSI13: OKAY - MSI14: OKAY - MSI15: OKAY - MSI16: OKAY - MSI17: NOT OKAY - MSI18: NOT OKAY - MSI19: NOT OKAY - MSI20: NOT OKAY - MSI21: NOT OKAY - MSI22: NOT OKAY - MSI23: NOT OKAY - MSI24: NOT OKAY - MSI25: NOT OKAY - MSI26: NOT OKAY - MSI27: NOT OKAY - MSI28: NOT OKAY - MSI29: NOT OKAY - MSI30: NOT OKAY - MSI31: NOT OKAY - MSI32: NOT OKAY - SET IRQ TYPE TO MSI-X: OKAY - MSI-X1: OKAY - MSI-X2: OKAY - MSI-X3: OKAY - MSI-X4: OKAY - MSI-X5: OKAY - MSI-X6: OKAY - MSI-X7: OKAY - MSI-X8: OKAY - MSI-X9: NOT OKAY - MSI-X10: NOT OKAY - MSI-X11: NOT OKAY - MSI-X12: NOT OKAY - MSI-X13: NOT OKAY - MSI-X14: NOT OKAY - MSI-X15: NOT OKAY - MSI-X16: NOT OKAY - [...] - MSI-X2047: NOT OKAY - MSI-X2048: NOT OKAY - - Read Tests - - SET IRQ TYPE TO MSI: OKAY - READ ( 1 bytes): OKAY - READ ( 1024 bytes): OKAY - READ ( 1025 bytes): OKAY - READ (1024000 bytes): OKAY - READ (1024001 bytes): OKAY - - Write Tests - - WRITE ( 1 bytes): OKAY - WRITE ( 1024 bytes): OKAY - WRITE ( 1025 bytes): OKAY - WRITE (1024000 bytes): OKAY - WRITE (1024001 bytes): OKAY - - Copy Tests - - COPY ( 1 bytes): OKAY - COPY ( 1024 bytes): OKAY - COPY ( 1025 bytes): OKAY - COPY (1024000 bytes): OKAY - COPY (1024001 bytes): OKAY -- cgit From 8e6c8aa3b52e573a6db1f58b2ba9a9106a012963 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Apr 2019 22:45:09 +0200 Subject: isdn: gigaset: remove i4l support isdn4linux is getting removed, and the gigaset driver can still use the CAPI support, so this can all go away. Signed-off-by: Arnd Bergmann --- Documentation/isdn/README.gigaset | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) (limited to 'Documentation') diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset index 9b1ce277ca3d..f6184b637182 100644 --- a/Documentation/isdn/README.gigaset +++ b/Documentation/isdn/README.gigaset @@ -48,9 +48,8 @@ GigaSet 307x Device Driver 1.2. Software -------- - The driver works with the Kernel CAPI subsystem as well as the old - ISDN4Linux subsystem, so it can be used with any software which is able - to use CAPI 2.0 or ISDN4Linux for ISDN connections (voice or data). + The driver works with the Kernel CAPI subsystem and can be used with any + software which is able to use CAPI 2.0 for ISDN connections (voice or data). There are some user space tools available at https://sourceforge.net/projects/gigaset307x/ @@ -92,7 +91,7 @@ GigaSet 307x Device Driver gigaset debug debug level (see section 3.2.) startmode initial operation mode (see section 2.5.): - bas_gigaset ) 1=ISDN4linux/CAPI (default), 0=Unimodem + bas_gigaset ) 1=CAPI (default), 0=Unimodem ser_gigaset ) usb_gigaset ) cidmode initial Call-ID mode setting (see section 2.5.): 1=on (default), 0=off @@ -154,18 +153,10 @@ GigaSet 307x Device Driver 2.3. CAPI ---- - If the driver is compiled with CAPI support (kernel configuration option - GIGASET_CAPI) the devices will show up as CAPI controllers as soon as the - corresponding driver module is loaded, and can then be used with CAPI 2.0 - kernel and user space applications. For user space access, the module - capi.ko must be loaded. - - Legacy ISDN4Linux applications are supported via the capidrv - compatibility driver. The kernel module capidrv.ko must be loaded - explicitly with the command - modprobe capidrv - if needed, and cannot be unloaded again without unloading the driver - first. (These are limitations of capidrv.) + The devices will show up as CAPI controllers as soon as the + corresponding driver module is loaded, and can then be used with + CAPI 2.0 kernel and user space applications. For user space access, + the module capi.ko must be loaded. Most distributions handle loading and unloading of the various CAPI modules automatically via the command capiinit(1) from the capi4k-utils @@ -173,16 +164,6 @@ GigaSet 307x Device Driver Gigaset drivers because it doesn't support more than one module per driver. -2.4. ISDN4Linux - ---------- - If the driver is compiled without CAPI support (native ISDN4Linux - variant), it registers the device with the legacy ISDN4Linux subsystem - after loading the module. It can then be used with ISDN4Linux - applications only. Most distributions provide some configuration utility - for setting up that subsystem. Otherwise you can use some HOWTOs like - http://www.linuxhaven.de/dlhp/HOWTO/DE-ISDN-HOWTO-5.html - - 2.5. Unimodem mode ------------- In this mode the device works like a modem connected to a serial port @@ -281,8 +262,7 @@ GigaSet 307x Device Driver number. Dialing "***" (three asterisks) calls all extensions simultaneously (global call). - This holds for both CAPI 2.0 and ISDN4Linux applications. Unimodem mode - does not support internal calls. + Unimodem mode does not support internal calls. 2.8. Unregistered Wireless Devices (M101/M105) ----------------------------------------- -- cgit From 85993b8c9786fb24975dbcabebb1c75790d4fb6a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Apr 2019 22:47:35 +0200 Subject: isdn: remove hisax driver With the decline of ISDN, this seems to have become almost completely obsolete, and even in the past years before that, almost all remaining users appear to have used mISDN instead. Birger Harzenetter noted that he is still using i4l/hisax to take advantage of the 'divert' driver for call diversion, but otherwise uses mISDN on the same hardware. This is a rare edge case as far as I can tell, but we are still breaking an actively used work flow (see https://xkcd.com/1172/). We debated moving i4l/hisax to staging as an intermediate step, but as he is not likely to change the setup, and that would just delay breaking this use case. The alternatives here are to stay on stable kernels < 5.2, to create an external driver repository for isdn4linux, or to add divert functionality to mISDN. Cc: Birger Harzenetter Signed-off-by: Arnd Bergmann --- Documentation/isdn/HiSax.cert | 96 ------ Documentation/isdn/README.HiSax | 659 ---------------------------------------- 2 files changed, 755 deletions(-) delete mode 100644 Documentation/isdn/HiSax.cert delete mode 100644 Documentation/isdn/README.HiSax (limited to 'Documentation') diff --git a/Documentation/isdn/HiSax.cert b/Documentation/isdn/HiSax.cert deleted file mode 100644 index f2a6fcb8efee..000000000000 --- a/Documentation/isdn/HiSax.cert +++ /dev/null @@ -1,96 +0,0 @@ ------BEGIN PGP SIGNED MESSAGE----- - -First: - - HiSax is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - -However, if you wish to modify the HiSax sources, please note the following: - -HiSax has passed the ITU approval test suite with ELSA Quickstep ISDN cards -and Eicon Technology Diva 2.01 PCI card. -The certification is only valid for the combination of the tested software -version and the tested hardware. Any changes to the HiSax source code may -therefore affect the certification. - -Additional ITU approval tests have been carried out for all generic cards -using Colognechip single chip solutions HFC-S PCI A for PCI cards as well -as HFC-S USB based USB ISDN ta adapters. -These tests included all layers 1-3 and as well all functional tests for -the layer 1. Because all hardware based on these chips are complete ISDN -solutions in one chip all cards and USB-TAs using these chips are to be -regarded as approved for those tests. Some additional electrical tests -of the layer 1 which are independent of the driver and related to a -special hardware used will be regarded as approved if at least one -solution has been tested including those electrical tests. So if cards -or tas have been completely approved for any other os, the approval -for those electrical tests is valid for linux, too. -Please send any questions regarding this drivers or approval abouts to -werner@isdn-development.de -Additional information and the type approval documents will be found -shortly on the Colognechip website www.colognechip.com - -If you change the main files of the HiSax ISDN stack, the certification will -become invalid. Because in most countries it is illegal to connect -unapproved ISDN equipment to the public network, I have to guarantee that -changes in HiSax do not affect the certification. - -In order to make a valid certification apparent to the user, I have built in -some validation checks that are made during the make process. The HiSax main -files are protected by md5 checksums and the md5sum file is pgp signed by -myself: - -KeyID 1024/FF992F6D 1997/01/16 Karsten Keil -Key fingerprint = 92 6B F7 58 EE 86 28 C8 C4 1A E6 DC 39 89 F2 AA - -Only if the checksums are OK, and the signature of the file -"drivers/isdn/hisax/md5sums.asc" match, is the certification valid; a -message confirming this is then displayed during the hisax init process. - -The affected files are: - -drivers/isdn/hisax/isac.c -drivers/isdn/hisax/isdnl1.c -drivers/isdn/hisax/isdnl2.c -drivers/isdn/hisax/isdnl3.c -drivers/isdn/hisax/tei.c -drivers/isdn/hisax/callc.c -drivers/isdn/hisax/l3dss1.c -drivers/isdn/hisax/l3_1tr6.c -drivers/isdn/hisax/cert.c -drivers/isdn/hisax/elsa.c -drivers/isdn/hisax/diva.c -drivers/isdn/hisax/hfc_pci.c - -Please send any changes, bugfixes and patches to me rather than implementing -them directly into the HiSax sources. - -This does not reduce your rights granted by the GNU General Public License. -If you wish to change the sources, go ahead; but note that then the -certification is invalid even if you use one of the approved cards. - -Here are the certification registration numbers for ELSA Quickstep cards: -German D133361J CETECOM ICT Services GmbH 0682 -European D133362J CETECOM ICT Services GmbH 0682 - - -Karsten Keil -keil@isdn4linux.de - ------BEGIN PGP SIGNATURE----- -Version: 2.6.3i -Charset: noconv - -iQCVAwUBOFAwqTpxHvX/mS9tAQFI2QP9GLDK2iy/KBhwReE3F7LeO+tVhffTVZ3a -20q5/z/WcIg/pnH0uTkl2UgDXBFXYl45zJyDGNpAposIFmT+Edd14o7Vj1w/BBdn -Y+5rBmJf+gyBu61da5d6bv0lpymwRa/um+ri+ilYnZ/XPfg5JKhdjGSBCJuJAElM -d2jFbTrsMYw= -=LNf9 ------END PGP SIGNATURE----- diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax deleted file mode 100644 index b1a573cf4472..000000000000 --- a/Documentation/isdn/README.HiSax +++ /dev/null @@ -1,659 +0,0 @@ -HiSax is a Linux hardware-level driver for passive ISDN cards with Siemens -chipset (ISAC_S 2085/2086/2186, HSCX SAB 82525). It is based on the Teles -driver from Jan den Ouden. -It is meant to be used with isdn4linux, an ISDN link-level module for Linux -written by Fritz Elfert. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - -Supported cards ---------------- - -Teles 8.0/16.0/16.3 and compatible ones -Teles 16.3c -Teles S0/PCMCIA -Teles PCI -Teles S0Box -Creatix S0Box -Creatix PnP S0 -Compaq ISDN S0 ISA card -AVM A1 (Fritz, Teledat 150) -AVM Fritz PCMCIA -AVM Fritz PnP -AVM Fritz PCI -ELSA Microlink PCC-16, PCF, PCF-Pro, PCC-8 -ELSA Quickstep 1000 -ELSA Quickstep 1000PCI -ELSA Quickstep 3000 (same settings as QS1000) -ELSA Quickstep 3000PCI -ELSA PCMCIA -ITK ix1-micro Rev.2 -Eicon Diva 2.0 ISA and PCI (S0 and U interface, no PRO version) -Eicon Diva 2.01 ISA and PCI -Eicon Diva 2.02 PCI -Eicon Diva Piccola -ASUSCOM NETWORK INC. ISDNLink 128K PC adapter (order code I-IN100-ST-D) -Dynalink IS64PH (OEM version of ASUSCOM NETWORK INC. ISDNLink 128K adapter) -PCBIT-DP (OEM version of ASUSCOM NETWORK INC. ISDNLink) -HFC-2BS0 based cards (TeleInt SA1) -Sedlbauer Speed Card (Speed Win, Teledat 100, PCI, Fax+) -Sedlbauer Speed Star/Speed Star2 (PCMCIA) -Sedlbauer ISDN-Controller PC/104 -USR Sportster internal TA (compatible Stollmann tina-pp V3) -USR internal TA PCI -ith Kommunikationstechnik GmbH MIC 16 ISA card -Traverse Technologie NETjet PCI S0 card and NETspider U card -Ovislink ISDN sc100-p card (NETjet driver) -Dr. Neuhaus Niccy PnP/PCI -Siemens I-Surf 1.0 -Siemens I-Surf 2.0 (with IPAC, try type 12 asuscom) -ACER P10 -HST Saphir -Berkom Telekom A4T -Scitel Quadro -Gazel ISDN cards -HFC-PCI based cards -Winbond W6692 based cards -HFC-S+, HFC-SP/PCMCIA cards -formula-n enternow -Gerdes Power ISDN - -Note: PCF, PCF-Pro: up to now, only the ISDN part is supported - PCC-8: not tested yet - Eicon.Diehl Diva U interface not tested - -If you know other passive cards with the Siemens chipset, please let me know. -You can combine any card, if there is no conflict between the resources -(io, mem, irq). - - -Configuring the driver ----------------------- - -The HiSax driver can either be built directly into the kernel or as a module. -It can be configured using the command line feature while loading the kernel -with LILO or LOADLIN or, if built as a module, using insmod/modprobe with -parameters. -There is also some config needed before you compile the kernel and/or -modules. It is included in the normal "make [menu]config" target at the -kernel. Don't forget it, especially to select the right D-channel protocol. - -Please note: In older versions of the HiSax driver, all PnP cards -needed to be configured with isapnp and worked only with the HiSax -driver used as a module. - -In the current version, HiSax will automatically use the in-kernel -ISAPnP support, provided you selected it during kernel configuration -(CONFIG_ISAPNP), if you don't give the io=, irq= command line parameters. - -The affected card types are: 4,7,12,14,19,27-30 - -a) when built as a module -------------------------- - -insmod/modprobe hisax.o \ - io=iobase irq=IRQ mem=membase type=card_type \ - protocol=D_channel_protocol id=idstring - -or, if several cards are installed: - -insmod/modprobe hisax.o \ - io=iobase1,iobase2,... irq=IRQ1,IRQ2,... mem=membase1,membase2,... \ - type=card_type1,card_type2,... \ - protocol=D_channel_protocol1,D_channel_protocol2,... \ - id=idstring1%idstring2 ... - -where "iobaseN" represents the I/O base address of the Nth card, "membaseN" -the memory base address of the Nth card, etc. - -The reason for the delimiter "%" being used in the idstrings is that "," -won't work with the current modules package. - -The parameters may be specified in any order. For example, the "io" -parameter may precede the "irq" parameter, or vice versa. If several -cards are installed, the ordering within the comma separated parameter -lists must of course be consistent. - -Only parameters applicable to the card type need to be specified. For -example, the Teles 16.3 card is not memory-mapped, so the "mem" -parameter may be omitted for this card. Sometimes it may be necessary -to specify a dummy parameter, however. This is the case when there is -a card of a different type later in the list that needs a parameter -which the preceding card does not. For instance, if a Teles 16.0 card -is listed after a Teles 16.3 card, a dummy memory base parameter of 0 -must be specified for the 16.3. Instead of a dummy value, the parameter -can also be skipped by simply omitting the value. For example: -mem=,0xd0000. See example 6 below. - -The parameter for the D-Channel protocol may be omitted if you selected the -correct one during kernel config. Valid values are "1" for German 1TR6, -"2" for EDSS1 (Euro ISDN), "3" for leased lines (no D-Channel) and "4" -for US NI1. -With US NI1 you have to include your SPID into the MSN setting in the form -: for example (your phonenumber is 1234 your SPID 5678): -AT&E1234:5678 on ttyI interfaces -isdnctrl eaz ippp0 1234:5678 on network devices - -The Creatix/Teles PnP cards use io1= and io2= instead of io= for specifying -the I/O addresses of the ISAC and HSCX chips, respectively. - -Card types: - - Type Required parameters (in addition to type and protocol) - - 1 Teles 16.0 irq, mem, io - 2 Teles 8.0 irq, mem - 3 Teles 16.3 (non PnP) irq, io - 4 Creatix/Teles PnP irq, io0 (ISAC), io1 (HSCX) - 5 AVM A1 (Fritz) irq, io - 6 ELSA PCC/PCF cards io or nothing for autodetect (the iobase is - required only if you have more than one ELSA - card in your PC) - 7 ELSA Quickstep 1000 irq, io (from isapnp setup) - 8 Teles 16.3 PCMCIA irq, io - 9 ITK ix1-micro Rev.2 irq, io - 10 ELSA PCMCIA irq, io (set with card manager) - 11 Eicon.Diehl Diva ISA PnP irq, io - 11 Eicon.Diehl Diva PCI no parameter - 12 ASUS COM ISDNLink irq, io (from isapnp setup) - 13 HFC-2BS0 based cards irq, io - 14 Teles 16.3c PnP irq, io - 15 Sedlbauer Speed Card irq, io - 15 Sedlbauer PC/104 irq, io - 15 Sedlbauer Speed PCI no parameter - 16 USR Sportster internal irq, io - 17 MIC card irq, io - 18 ELSA Quickstep 1000PCI no parameter - 19 Compaq ISDN S0 ISA card irq, io0, io1, io (from isapnp setup io=IO2) - 20 NETjet PCI card no parameter - 21 Teles PCI no parameter - 22 Sedlbauer Speed Star (PCMCIA) irq, io (set with card manager) - 24 Dr. Neuhaus Niccy PnP irq, io0, io1 (from isapnp setup) - 24 Dr. Neuhaus Niccy PCI no parameter - 25 Teles S0Box irq, io (of the used lpt port) - 26 AVM A1 PCMCIA (Fritz!) irq, io (set with card manager) - 27 AVM PnP (Fritz!PnP) irq, io (from isapnp setup) - 27 AVM PCI (Fritz!PCI) no parameter - 28 Sedlbauer Speed Fax+ irq, io (from isapnp setup) - 29 Siemens I-Surf 1.0 irq, io, memory (from isapnp setup) - 30 ACER P10 irq, io (from isapnp setup) - 31 HST Saphir irq, io - 32 Telekom A4T none - 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4) - 34 Gazel ISDN cards (ISA) irq,io - 34 Gazel ISDN cards (PCI) none - 35 HFC 2BDS0 PCI none - 36 W6692 based PCI cards none - 37 HFC 2BDS0 S+, SP irq,io - 38 NETspider U PCI card none - 39 HFC 2BDS0 SP/PCMCIA irq,io (set with cardmgr) - 40 hotplug interface - 41 Formula-n enter:now PCI none - -At the moment IRQ sharing is only possible with PCI cards. Please make sure -that your IRQ is free and enabled for ISA use. - - -Examples for module loading - -1. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 10 - modprobe hisax type=3 protocol=2 io=0x280 irq=10 - -2. Teles 16.0, 1TR6 ISDN, I/O base d80 hex, IRQ 5, Memory d0000 hex - modprobe hisax protocol=1 type=1 io=0xd80 mem=0xd0000 irq=5 - -3. Fritzcard, Euro ISDN, I/O base 340 hex, IRQ 10 and ELSA PCF, Euro ISDN - modprobe hisax type=5,6 protocol=2,2 io=0x340 irq=10 id=Fritz%Elsa - -4. Any ELSA PCC/PCF card, Euro ISDN - modprobe hisax type=6 protocol=2 - -5. Teles 16.3 PnP, Euro ISDN, with isapnp configured - isapnp config: (INT 0 (IRQ 10 (MODE +E))) - (IO 0 (BASE 0x0580)) - (IO 1 (BASE 0x0180)) - modprobe hisax type=4 protocol=2 irq=10 io0=0x580 io1=0x180 - - In the current version of HiSax, you can instead simply use - - modprobe hisax type=4 protocol=2 - - if you configured your kernel for ISAPnP. Don't run isapnp in - this case! - -6. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 12 and - Teles 16.0, 1TR6, IRQ 5, Memory d0000 hex - modprobe hisax type=3,1 protocol=2,1 io=0x280 mem=0,0xd0000 - - Please note the dummy 0 memory address for the Teles 16.3, used as a - placeholder as described above, in the last example. - -7. Teles PCMCIA, Euro ISDN, I/O base 180 hex, IRQ 15 (default values) - modprobe hisax type=8 protocol=2 io=0x180 irq=15 - - -b) using LILO/LOADLIN, with the driver compiled directly into the kernel ------------------------------------------------------------------------- - -hisax=typ1,dp1,pa_1,pb_1,pc_1[,typ2,dp2,pa_2 ... \ - typn,dpn,pa_n,pb_n,pc_n][,idstring1[,idstring2,...,idstringn]] - -where - typ1 = type of 1st card (default depends on kernel settings) - dp1 = D-Channel protocol of 1st card. 1=1TR6, 2=EDSS1, 3=leased - pa_1 = 1st parameter (depending on the type of the card) - pb_1 = 2nd parameter ( " " " " " " " ) - pc_1 = 3rd parameter ( " " " " " " " ) - - typ2,dp2,pa_2,pb_2,pc_2 = Parameters of the second card (defaults: none) - typn,dpn,pa_n,pb_n,pc_n = Parameters of the n'th card (up to 16 cards are - supported) - - idstring = Driver ID for accessing the particular card with utility - programs and for identification when using a line monitor - (default: "HiSax") - - Note: the ID string must start with an alphabetical character! - -Card types: - -type - 1 Teles 16.0 pa=irq pb=membase pc=iobase - 2 Teles 8.0 pa=irq pb=membase - 3 Teles 16.3 pa=irq pb=iobase - 4 Creatix/Teles PNP ONLY WORKS AS A MODULE ! - 5 AVM A1 (Fritz) pa=irq pb=iobase - 6 ELSA PCC/PCF cards pa=iobase or nothing for autodetect - 7 ELSA Quickstep 1000 ONLY WORKS AS A MODULE ! - 8 Teles S0 PCMCIA pa=irq pb=iobase - 9 ITK ix1-micro Rev.2 pa=irq pb=iobase - 10 ELSA PCMCIA pa=irq, pb=io (set with card manager) - 11 Eicon.Diehl Diva ISAPnP ONLY WORKS AS A MODULE ! - 11 Eicon.Diehl Diva PCI no parameter - 12 ASUS COM ISDNLink ONLY WORKS AS A MODULE ! - 13 HFC-2BS0 based cards pa=irq pb=io - 14 Teles 16.3c PnP ONLY WORKS AS A MODULE ! - 15 Sedlbauer Speed Card pa=irq pb=io (Speed Win only as module !) - 15 Sedlbauer PC/104 pa=irq pb=io - 15 Sedlbauer Speed PCI no parameter - 16 USR Sportster internal pa=irq pb=io - 17 MIC card pa=irq pb=io - 18 ELSA Quickstep 1000PCI no parameter - 19 Compaq ISDN S0 ISA card ONLY WORKS AS A MODULE ! - 20 NETjet PCI card no parameter - 21 Teles PCI no parameter - 22 Sedlbauer Speed Star (PCMCIA) pa=irq, pb=io (set with card manager) - 24 Dr. Neuhaus Niccy PnP ONLY WORKS AS A MODULE ! - 24 Dr. Neuhaus Niccy PCI no parameter - 25 Teles S0Box pa=irq, pb=io (of the used lpt port) - 26 AVM A1 PCMCIA (Fritz!) pa=irq, pb=io (set with card manager) - 27 AVM PnP (Fritz!PnP) ONLY WORKS AS A MODULE ! - 27 AVM PCI (Fritz!PCI) no parameter - 28 Sedlbauer Speed Fax+ ONLY WORKS AS A MODULE ! - 29 Siemens I-Surf 1.0 ONLY WORKS AS A MODULE ! - 30 ACER P10 ONLY WORKS AS A MODULE ! - 31 HST Saphir pa=irq, pb=io - 32 Telekom A4T no parameter - 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4) - 34 Gazel ISDN cards (ISA) pa=irq, pb=io - 34 Gazel ISDN cards (PCI) no parameter - 35 HFC 2BDS0 PCI no parameter - 36 W6692 based PCI cards none - 37 HFC 2BDS0 S+,SP/PCMCIA ONLY WORKS AS A MODULE ! - 38 NETspider U PCI card none - 39 HFC 2BDS0 SP/PCMCIA ONLY WORKS AS A MODULE ! - 40 hotplug interface ONLY WORKS AS A MODULE ! - 41 Formula-n enter:now PCI none - -Running the driver ------------------- - -When you insmod isdn.o and hisax.o (or with the in-kernel version, during -boot time), a few lines should appear in your syslog. Look for something like: - -Apr 13 21:01:59 kke01 kernel: HiSax: Driver for Siemens chip set ISDN cards -Apr 13 21:01:59 kke01 kernel: HiSax: Version 2.9 -Apr 13 21:01:59 kke01 kernel: HiSax: Revisions 1.14/1.9/1.10/1.25/1.8 -Apr 13 21:01:59 kke01 kernel: HiSax: Total 1 card defined -Apr 13 21:01:59 kke01 kernel: HiSax: Card 1 Protocol EDSS1 Id=HiSax1 (0) -Apr 13 21:01:59 kke01 kernel: HiSax: Elsa driver Rev. 1.13 -... -Apr 13 21:01:59 kke01 kernel: Elsa: PCF-Pro found at 0x360 Rev.:C IRQ 10 -Apr 13 21:01:59 kke01 kernel: Elsa: timer OK; resetting card -Apr 13 21:01:59 kke01 kernel: Elsa: HSCX version A: V2.1 B: V2.1 -Apr 13 21:01:59 kke01 kernel: Elsa: ISAC 2086/2186 V1.1 -... -Apr 13 21:01:59 kke01 kernel: HiSax: DSS1 Rev. 1.14 -Apr 13 21:01:59 kke01 kernel: HiSax: 2 channels added - -This means that the card is ready for use. -Cabling problems or line-downs are not detected, and only some ELSA cards can -detect the S0 power. - -Remember that, according to the new strategy for accessing low-level drivers -from within isdn4linux, you should also define a driver ID while doing -insmod: Simply append hisax_id= to the insmod command line. This -string MUST NOT start with a digit or a small 'x'! - -At this point you can run a 'cat /dev/isdnctrl0' and view debugging messages. - -At the moment, debugging messages are enabled with the hisaxctrl tool: - - hisaxctrl DebugCmd - - default is HiSax, if you didn't specify one. - -DebugCmd is 1 for generic debugging - 11 for layer 1 development debugging - 13 for layer 3 development debugging - -where is the integer sum of the following debugging -options you wish enabled: - -With DebugCmd set to 1: - - 0x0001 Link-level <--> hardware-level communication - 0x0002 Top state machine - 0x0004 D-Channel Frames for isdnlog - 0x0008 D-Channel Q.921 - 0x0010 B-Channel X.75 - 0x0020 D-Channel l2 - 0x0040 B-Channel l2 - 0x0080 D-Channel link state debugging - 0x0100 B-Channel link state debugging - 0x0200 TEI debug - 0x0400 LOCK debug in callc.c - 0x0800 More paranoid debug in callc.c (not for normal use) - 0x1000 D-Channel l1 state debugging - 0x2000 B-Channel l1 state debugging - -With DebugCmd set to 11: - - 0x0001 Warnings (default: on) - 0x0002 IRQ status - 0x0004 ISAC - 0x0008 ISAC FIFO - 0x0010 HSCX - 0x0020 HSCX FIFO (attention: full B-Channel output!) - 0x0040 D-Channel LAPD frame types - 0x0080 IPAC debug - 0x0100 HFC receive debug - 0x0200 ISAC monitor debug - 0x0400 D-Channel frames for isdnlog (set with 1 0x4 too) - 0x0800 D-Channel message verbose - -With DebugCmd set to 13: - - 1 Warnings (default: on) - 2 l3 protocol descriptor errors - 4 l3 state machine - 8 charge info debugging (1TR6) - -For example, 'hisaxctrl HiSax 1 0x3ff' enables full generic debugging. - -Because of some obscure problems with some switch equipment, the delay -between the CONNECT message and sending the first data on the B-channel is now -configurable with - -hisaxctrl 2 - in ms Value between 50 and 800 ms is recommended. - -Downloading Firmware --------------------- -At the moment, the Sedlbauer speed fax+ is the only card, which -needs to download firmware. -The firmware is downloaded with the hisaxctrl tool: - - hisaxctrl 9 - - default is HiSax, if you didn't specify one, - -where is the filename of the firmware file. - -For example, 'hisaxctrl HiSax 9 ISAR.BIN' downloads the firmware for -ISAR based cards (like the Sedlbauer speed fax+). - -Warning -------- -HiSax is a work in progress and may crash your machine. -For certification look at HiSax.cert file. - -Limitations ------------ -At this time, HiSax only works on Euro ISDN lines and German 1TR6 lines. -For leased lines see appendix. - -Bugs ----- -If you find any, please let me know. - - -Thanks ------- -Special thanks to: - - Emil Stephan for the name HiSax which is a mix of HSCX and ISAC. - - Fritz Elfert, Jan den Ouden, Michael Hipp, Michael Wein, - Andreas Kool, Pekka Sarnila, Sim Yskes, Johan Myrre'en, - Klaus-Peter Nischke (ITK AG), Christof Petig, Werner Fehn (ELSA GmbH), - Volker Schmidt - Edgar Toernig and Marcus Niemann for the Sedlbauer driver - Stephan von Krawczynski - Juergen Quade for the Leased Line part - Klaus Lichtenwalder (Klaus.Lichtenwalder@WebForum.DE), for ELSA PCMCIA support - Enrik Berkhan (enrik@starfleet.inka.de) for S0BOX specific stuff - Ton van Rosmalen for Teles PCI - Petr Novak for Winbond W6692 support - Werner Cornelius for HFC-PCI, HFC-S(+/P) and supplementary services support - and more people who are hunting bugs. (If I forgot somebody, please - send me a mail). - - Firma ELSA GmbH - Firma Eicon.Diehl GmbH - Firma Dynalink NL - Firma ASUSCOM NETWORK INC. Taiwan - Firma S.u.S.E - Firma ith Kommunikationstechnik GmbH - Firma Traverse Technologie Australia - Firma Medusa GmbH (www.medusa.de). - Firma Quant-X Austria for sponsoring a DEC Alpha board+CPU - Firma Cologne Chip Designs GmbH - - My girl friend and partner in life Ute for her patience with me. - - -Enjoy, - -Karsten Keil -keil@isdn4linux.de - - -Appendix: Teles PCMCIA driver ------------------------------ - -See - http://www.linux.no/teles_cs.txt -for instructions. - -Appendix: Linux and ISDN-leased lines -------------------------------------- - -Original from Juergen Quade, new version KKe. - -Attention NEW VERSION, the old leased line syntax won't work !!! - -You can use HiSax to connect your Linux-Box via an ISDN leased line -to e.g. the Internet: - -1. Build a kernel which includes the HiSax driver either as a module - or as part of the kernel. - cd /usr/src/linux - make menuconfig - - make clean; make zImage; make modules; make modules_install -2. Install the new kernel - cp /usr/src/linux/arch/x86/boot/zImage /etc/kernel/linux.isdn - vi /etc/lilo.conf - - lilo -3. in case the hisax driver is a "fixed" part of the kernel, configure - the driver with lilo: - vi /etc/lilo.conf - - lilo - Your lilo.conf _might_ look like the following: - - # LILO configuration-file - # global section - # teles 16.0 on IRQ=5, MEM=0xd8000, PORT=0xd80 - append="hisax=1,3,5,0xd8000,0xd80,HiSax" - # teles 16.3 (non pnp) on IRQ=15, PORT=0xd80 - # append="hisax=3,3,5,0xd8000,0xd80,HiSax" - boot=/dev/sda - compact # faster, but won't work on all systems. - linear - read-only - prompt - timeout=100 - vga = normal # force sane state - # Linux bootable partition config begins - image = /etc/kernel/linux.isdn - root = /dev/sda1 - label = linux.isdn - # - image = /etc/kernel/linux-2.0.30 - root = /dev/sda1 - label = linux.secure - - In the line starting with "append" you have to adapt the parameters - according to your card (see above in this file) - -3. boot the new linux.isdn kernel -4. start the ISDN subsystem: - a) load - if necessary - the modules (depends, whether you compiled - the ISDN driver as module or not) - According to the type of card you have to specify the necessary - driver parameter (irq, io, mem, type, protocol). - For the leased line the protocol is "3". See the table above for - the parameters, which you have to specify depending on your card. - b) configure i4l - /sbin/isdnctrl addif isdn0 - # EAZ 1 -- B1 channel 2 --B2 channel - /sbin/isdnctrl eaz isdn0 1 - /sbin/isdnctrl secure isdn0 on - /sbin/isdnctrl huptimeout isdn0 0 - /sbin/isdnctrl l2_prot isdn0 hdlc - # Attention you must not set an outgoing number !!! This won't work !!! - # The incoming number is LEASED0 for the first card, LEASED1 for the - # second and so on. - /sbin/isdnctrl addphone isdn0 in LEASED0 - # Here is no need to bind the channel. - c) in case the remote partner is a CISCO: - /sbin/isdnctrl encap isdn0 cisco-h - d) configure the interface - /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP} - e) set the routes - /sbin/route add -host ${REMOTE_IP} isdn0 - /sbin/route add default gw ${REMOTE_IP} - f) switch the card into leased mode for each used B-channel - /sbin/hisaxctrl HiSax 5 1 - -Remarks: -a) Use state of the art isdn4k-utils - -Here an example script: -#!/bin/sh -# Start/Stop ISDN leased line connection - -I4L_AS_MODULE=yes -I4L_REMOTE_IS_CISCO=no -I4L_MODULE_PARAMS="type=16 io=0x268 irq=7 " -I4L_DEBUG=no -I4L_LEASED_128K=yes -LOCAL_IP=192.168.1.1 -REMOTE_IP=192.168.2.1 - -case "$1" in - start) - echo "Starting ISDN ..." - if [ ${I4L_AS_MODULE} = "yes" ]; then - echo "loading modules..." - /sbin/modprobe hisax ${I4L_MODULE_PARAMS} - fi - # configure interface - /sbin/isdnctrl addif isdn0 - /sbin/isdnctrl secure isdn0 on - if [ ${I4L_DEBUG} = "yes" ]; then - /sbin/isdnctrl verbose 7 - /sbin/hisaxctrl HiSax 1 0xffff - /sbin/hisaxctrl HiSax 11 0xff - cat /dev/isdnctrl >/tmp/lea.log & - fi - if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then - /sbin/isdnctrl encap isdn0 cisco-h - fi - /sbin/isdnctrl huptimeout isdn0 0 - # B-CHANNEL 1 - /sbin/isdnctrl eaz isdn0 1 - /sbin/isdnctrl l2_prot isdn0 hdlc - # 1. card - /sbin/isdnctrl addphone isdn0 in LEASED0 - if [ ${I4L_LEASED_128K} = "yes" ]; then - /sbin/isdnctrl addslave isdn0 isdn0s - /sbin/isdnctrl secure isdn0s on - /sbin/isdnctrl huptimeout isdn0s 0 - # B-CHANNEL 2 - /sbin/isdnctrl eaz isdn0s 2 - /sbin/isdnctrl l2_prot isdn0s hdlc - # 1. card - /sbin/isdnctrl addphone isdn0s in LEASED0 - if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then - /sbin/isdnctrl encap isdn0s cisco-h - fi - fi - /sbin/isdnctrl dialmode isdn0 manual - # configure tcp/ip - /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP} - /sbin/route add -host ${REMOTE_IP} isdn0 - /sbin/route add default gw ${REMOTE_IP} - # switch to leased mode - # B-CHANNEL 1 - /sbin/hisaxctrl HiSax 5 1 - if [ ${I4L_LEASED_128K} = "yes" ]; then - # B-CHANNEL 2 - sleep 10; /* Wait for master */ - /sbin/hisaxctrl HiSax 5 2 - fi - ;; - stop) - /sbin/ifconfig isdn0 down - /sbin/isdnctrl delif isdn0 - if [ ${I4L_DEBUG} = "yes" ]; then - killall cat - fi - if [ ${I4L_AS_MODULE} = "yes" ]; then - /sbin/rmmod hisax - /sbin/rmmod isdn - /sbin/rmmod ppp - /sbin/rmmod slhc - fi - ;; - *) - echo "Usage: $0 {start|stop}" - exit 1 -esac -exit 0 -- cgit From 9c3c0c2048149d946d7f3ebdcbe70e2946750bfb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Apr 2019 22:43:36 +0200 Subject: isdn: remove isdn4linux With all isdn4linux hardware drivers gone, this is only a wrapper around CAPI to support old user space. However, from looking at the mailing list, it seems that the last time anyone asked about it was in 2014, when the upgrade from a linux-2.4 installation failed, and mISDN was suggested as a replacement. The largest public ISDN network (Deutsche Telekom) was supposed to be shut down 2018, which must have drastically reduced the number of legacy installations. When we last discussed removing i4l in 2016, Karsten Keil suggested revisiting this in 2018. I guess this is overdue. Link: http://listserv.isdn4linux.de/pipermail/isdn4linux/2014-October/006165.html Link: https://patchwork.kernel.org/patch/8484861/#17900371 Link: https://listserv.isdn4linux.de/pipermail/isdn4linux/2019-April/thread.html Signed-off-by: Arnd Bergmann --- Documentation/isdn/INTERFACE | 759 ------------------------------------ Documentation/isdn/INTERFACE.fax | 163 -------- Documentation/isdn/README | 599 ---------------------------- Documentation/isdn/README.FAQ | 26 -- Documentation/isdn/README.audio | 138 ------- Documentation/isdn/README.concap | 259 ------------ Documentation/isdn/README.diversion | 127 ------ Documentation/isdn/README.fax | 45 --- Documentation/isdn/README.hfc-pci | 41 -- Documentation/isdn/README.syncppp | 58 --- Documentation/isdn/README.x25 | 184 --------- Documentation/isdn/syncPPP.FAQ | 224 ----------- Documentation/process/changes.rst | 16 +- 13 files changed, 2 insertions(+), 2637 deletions(-) delete mode 100644 Documentation/isdn/INTERFACE delete mode 100644 Documentation/isdn/INTERFACE.fax delete mode 100644 Documentation/isdn/README delete mode 100644 Documentation/isdn/README.FAQ delete mode 100644 Documentation/isdn/README.audio delete mode 100644 Documentation/isdn/README.concap delete mode 100644 Documentation/isdn/README.diversion delete mode 100644 Documentation/isdn/README.fax delete mode 100644 Documentation/isdn/README.hfc-pci delete mode 100644 Documentation/isdn/README.syncppp delete mode 100644 Documentation/isdn/README.x25 delete mode 100644 Documentation/isdn/syncPPP.FAQ (limited to 'Documentation') diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE deleted file mode 100644 index 5df17e5b25c8..000000000000 --- a/Documentation/isdn/INTERFACE +++ /dev/null @@ -1,759 +0,0 @@ -$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $ - -Description of the Interface between Linklevel and Hardwarelevel - of isdn4linux: - - - The Communication between Linklevel (LL) and Hardwarelevel (HL) - is based on the struct isdn_if (defined in isdnif.h). - - An HL-driver can register itself at LL by calling the function - register_isdn() with a pointer to that struct. Prior to that, it has - to preset some of the fields of isdn_if. The LL sets the rest of - the fields. All further communication is done via callbacks using - the function-pointers defined in isdn_if. - - Changes/Version numbering: - - During development of the ISDN subsystem, several changes have been - made to the interface. Before it went into kernel, the package - had a unique version number. The last version, distributed separately - was 0.7.4. When the subsystem went into kernel, every functional unit - got a separate version number. These numbers are shown at initialization, - separated by slashes: - - c.c/t.t/n.n/p.p/a.a/v.v - - where - - c.c is the revision of the common code. - t.t is the revision of the tty related code. - n.n is the revision of the network related code. - p.p is the revision of the ppp related code. - a.a is the revision of the audio related code. - v.v is the revision of the V.110 related code. - - Changes in this document are marked with '***CHANGEx' where x representing - the version number. If that number starts with 0, it refers to the old, - separately distributed package. If it starts with one of the letters - above, it refers to the revision of the corresponding module. - ***CHANGEIx refers to the revision number of the isdnif.h - -1. Description of the fields of isdn_if: - - int channels; - - This field has to be set by the HL-driver to the number of channels - supported prior to calling register_isdn(). Upon return of the call, - the LL puts an id there, which has to be used by the HL-driver when - invoking the other callbacks. - - int maxbufsize; - - ***CHANGE0.6: New since this version. - - Also to be preset by the HL-driver. With this value the HL-driver - tells the LL the maximum size of a data-packet it will accept. - - unsigned long features; - - To be preset by the HL-driver. Using this field, the HL-driver - announces the features supported. At the moment this is limited to - report the supported layer2 and layer3-protocols. For setting this - field the constants ISDN_FEATURE..., declared in isdnif.h have to be - used. - - ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set. - - unsigned short hl_hdrlen; - - ***CHANGE0.7.4: New field. - - To be preset by the HL-driver, if it supports sk_buff's. The driver - should put here the amount of additional space needed in sk_buff's for - its internal purposes. Drivers not supporting sk_buff's should - initialize this field to 0. - - void (*rcvcallb_skb)(int, int, struct sk_buff *) - - ***CHANGE0.7.4: New field. - - This field will be set by LL. The HL-driver delivers received data- - packets by calling this function. Upon calling, the HL-driver must - already have its private data pulled off the head of the sk_buff. - - Parameter: - int driver-Id - int Channel-number locally to the driver. (starting with 0) - struct sk_buff * Pointer to sk_buff, containing received data. - - int (*statcallb)(isdn_ctrl*); - - This field will be set by LL. This function has to be called by the - HL-driver for signaling status-changes or other events to the LL. - - Parameter: - isdn_ctrl* - - The struct isdn_ctrl also defined in isdn_if. The exact meanings of its - fields are described together with the descriptions of the possible - events. Here is only a short description of the fields: - - driver = driver Id. - command = event-type. (one of the constants ISDN_STAT_...) - arg = depends on event-type. - num = depends on event-type. - - Returnvalue: - 0 on success, else -1 - - int (*command)(isdn_ctrl*); - - This field has to be preset by the HL-driver. It points to a function, - to be called by LL to perform functions like dialing, B-channel - setup, etc. The exact meaning of the parameters is described with the - descriptions of the possible commands. - - Parameter: - isdn_ctrl* - driver = driver-Id - command = command to perform. (one of the constants ISDN_CMD_...) - arg = depends on command. - num = depends on command. - - Returnvalue: - >=0 on success, else error-code (-ENODEV etc.) - - int (*writebuf_skb)(int, int, int, struct sk_buff *) - - ***CHANGE0.7.4: New field. - ***CHANGEI.1.21: New field. - - This field has to be preset by the HL-driver. The given function will - be called by the LL for delivering data to be send via B-Channel. - - - Parameter: - int driver-Id ***CHANGE0.7.4: New parameter. - int channel-number locally to the HL-driver. (starts with 0) - int ack ***ChangeI1.21: New parameter - If this is !0, the driver has to signal the delivery - by sending an ISDN_STAT_BSENT. If this is 0, the driver - MUST NOT send an ISDN_STAT_BSENT. - struct sk_buff * Pointer to sk_buff containing data to be send via - B-channel. - - Returnvalue: - Length of data accepted on success, else error-code (-EINVAL on - oversized packets etc.) - - int (*writecmd)(u_char*, int, int, int, int); - - This field has to be preset by the HL-driver. The given function will be - called to perform write-requests on /dev/isdnctrl (i.e. sending commands - to the card) The data-format is hardware-specific. This function is - intended for debugging only. It is not necessary for normal operation - and never will be called by the tty-emulation- or network-code. If - this function is not supported, the driver has to set NULL here. - - Parameter: - u_char* pointer to data. - int length of data. - int flag: 0 = call from within kernel-space. (HL-driver must use - memcpy, may NOT use schedule()) - 1 = call from user-space. (HL-driver must use - memcpy_fromfs, use of schedule() allowed) - int driver-Id. - int channel-number locally to the HL-driver. (starts with 0) - -***CHANGEI1.14: The driver-Id and channel-number are new since this revision. - - Returnvalue: - Length of data accepted on success, else error-code (-EINVAL etc.) - - int (*readstat)(u_char*, int, int, int, int); - - This field has to be preset by the HL-driver. The given function will be - called to perform read-requests on /dev/isdnctrl (i.e. reading replies - from the card) The data-format is hardware-specific. This function is - intended for debugging only. It is not necessary for normal operation - and never will be called by the tty-emulation- or network-code. If - this function is not supported, the driver has to set NULL here. - - Parameter: - u_char* pointer to data. - int length of data. - int flag: 0 = call from within kernel-space. (HL-driver must use - memcpy, may NOT use schedule()) - 1 = call from user-space. (HL-driver must use - memcpy_fromfs, use of schedule() allowed) - int driver-Id. - int channel-number locally to the HL-driver. (starts with 0) - -***CHANGEI1.14: The driver-Id and channel-number are new since this revision. - - Returnvalue: - Length of data on success, else error-code (-EINVAL etc.) - - char id[20]; - ***CHANGE0.7: New since this version. - - This string has to be preset by the HL-driver. Its purpose is for - identification of the driver by the user. Eg.: it is shown in the - status-info of /dev/isdninfo. Furthermore it is used as Id for binding - net-interfaces to a specific channel. If a string of length zero is - given, upon return, isdn4linux will replace it by a generic name. (line0, - line1 etc.) It is recommended to make this string configurable during - module-load-time. (copy a global variable to this string.) For doing that, - modules 1.2.8 or newer are necessary. - -2. Description of the commands, a HL-driver has to support: - - All commands will be performed by calling the function command() described - above from within the LL. The field command of the struct-parameter will - contain the desired command, the field driver is always set to the - appropriate driver-Id. - - Until now, the following commands are defined: - -***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing - the old "num" and a new setup_type struct used for ISDN_CMD_DIAL - and ISDN_STAT_ICALL callback. - - ISDN_CMD_IOCTL: - - This command is intended for performing ioctl-calls for configuring - hardware or similar purposes (setting port-addresses, loading firmware - etc.) For this purpose, in the LL all ioctl-calls with an argument - >= IIOCDRVCTL (0x100) will be handed transparently to this - function after subtracting 0x100 and placing the result in arg. - Example: - If a userlevel-program calls ioctl(0x101,...) the function gets - called with the field command set to 1. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_IOCTL - arg = Original ioctl-cmd - IIOCDRVCTL - parm.num = first bytes filled with (unsigned long)arg - - Returnvalue: - Depending on driver. - - - ISDN_CMD_DIAL: - - This command is used to tell the HL-driver it should dial a given - number. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_DIAL - arg = channel-number locally to the driver. (starting with 0) - - parm.setup.phone = An ASCII-String containing the number to dial. - parm.setup.eazmsn = An ASCII-Sting containing the own EAZ or MSN. - parm.setup.si1 = The Service-Indicator. - parm.setup.si2 = Additional Service-Indicator. - - If the Line has been designed as SPV (a special german - feature, meaning semi-leased-line) the phone has to - start with an "S". - ***CHANGE0.6: In previous versions the EAZ has been given in the - highbyte of arg. - ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo. - - ISDN_CMD_ACCEPTD: - - With this command, the HL-driver is told to accept a D-Channel-setup. - (Response to an incoming call) - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ACCEPTD - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_ACCEPTB: - - With this command, the HL-driver is told to perform a B-Channel-setup. - (after establishing D-Channel-Connection) - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ACCEPTB - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_HANGUP: - - With this command, the HL-driver is told to hangup (B-Channel if - established first, then D-Channel). This command is also used for - actively rejecting an incoming call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_HANGUP - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_CLREAZ: - - With this command, the HL-driver is told not to signal incoming - calls to the LL. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_CLREAZ - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_SETEAZ: - - With this command, the HL-driver is told to signal incoming calls for - the given EAZs/MSNs to the LL. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETEAZ - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the desired EAZ's/MSN's - (comma-separated). If an empty String is given, the - HL-driver should respond to ALL incoming calls, - regardless of the destination-address. - ***CHANGE0.6: New since this version the "empty-string"-feature. - - ISDN_CMD_GETEAZ: (currently unused) - - With this command, the HL-driver is told to report the current setting - given with ISDN_CMD_SETEAZ. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETEAZ - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the current EAZ's/MSN's - - ISDN_CMD_SETSIL: (currently unused) - - With this command, the HL-driver is told to signal only incoming - calls with the given Service-Indicators. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETSIL - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the desired Service-Indicators. - - ISDN_CMD_GETSIL: (currently unused) - - With this command, the HL-driver is told to return the current - Service-Indicators it will respond to. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETSIL - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the current Service-Indicators. - - ISDN_CMD_SETL2: - - With this command, the HL-driver is told to select the given Layer-2- - protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or - ISDN_CMD_ACCEPTD. - - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETL2 - arg = channel-number locally to the driver. (starting with 0) - logical or'ed with (protocol-Id << 8) - protocol-Id is one of the constants ISDN_PROTO_L2... - parm = unused. - - ISDN_CMD_GETL2: (currently unused) - - With this command, the HL-driver is told to return the current - setting of the Layer-2-protocol. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETL2 - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - Returnvalue: - current protocol-Id (one of the constants ISDN_L2_PROTO) - - ISDN_CMD_SETL3: - - With this command, the HL-driver is told to select the given Layer-3- - protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or - ISDN_CMD_ACCEPTD. - - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETL3 - arg = channel-number locally to the driver. (starting with 0) - logical or'ed with (protocol-Id << 8) - protocol-Id is one of the constants ISDN_PROTO_L3... - parm.fax = Pointer to T30_s fax struct. (fax usage only) - - ISDN_CMD_GETL2: (currently unused) - - With this command, the HL-driver is told to return the current - setting of the Layer-3-protocol. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETL3 - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - Returnvalue: - current protocol-Id (one of the constants ISDN_L3_PROTO) - - ISDN_CMD_PROCEED: - - With this command, the HL-driver is told to proceed with a incoming call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_PROCEED - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string send as uus1 in DSS1 with - PROCEED message - - ISDN_CMD_ALERT: - - With this command, the HL-driver is told to alert a proceeding call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ALERT - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string send as uus1 in DSS1 with - ALERT message - - ISDN_CMD_REDIR: - - With this command, the HL-driver is told to redirect a call in proceeding - or alerting state. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_REDIR - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string send as uus1 in DSS1 protocol - setup.screen= screening indicator - setup.phone = redirected to party number - - ISDN_CMD_PROT_IO: - - With this call, the LL-driver invokes protocol specific features through - the LL. - The call is not implicitely bound to a connection. - - Parameter: - driver = driver-Id - command = ISDN_CMD_PROT_IO - arg = The lower 8 Bits define the addressed protocol as defined - in ISDN_PTYPE..., the upper bits are used to differentiate - the protocol specific CMD. - - para = protocol and function specific. See isdnif.h for detail. - - - ISDN_CMD_FAXCMD: - - With this command the HL-driver receives a fax sub-command. - For details refer to INTERFACE.fax - - Parameter: - driver = driver-Id. - command = ISDN_CMD_FAXCMD - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - -3. Description of the events to be signaled by the HL-driver to the LL. - - All status-changes are signaled via calling the previously described - function statcallb(). The field command of the struct isdn_cmd has - to be set by the HL-driver with the appropriate Status-Id (event-number). - The field arg has to be set to the channel-number (locally to the driver, - starting with 0) to which this event applies. (Exception: STAVAIL-event) - - Until now, the following Status-Ids are defined: - - ISDN_STAT_AVAIL: - - With this call, the HL-driver signals the availability of new data - for readstat(). Used only for debugging-purposes, see description - of readstat(). - - Parameter: - driver = driver-Id - command = ISDN_STAT_STAVAIL - arg = length of available data. - parm = unused. - - ISDN_STAT_ICALL: - ISDN_STAT_ICALLW: - - With this call, the HL-driver signals an incoming call to the LL. - If ICALLW is signalled the incoming call is a waiting call without - a available B-chan. - - Parameter: - driver = driver-Id - command = ISDN_STAT_ICALL - arg = channel-number, locally to the driver. (starting with 0) - para.setup.phone = Callernumber. - para.setup.eazmsn = CalledNumber. - para.setup.si1 = Service Indicator. - para.setup.si2 = Additional Service Indicator. - para.setup.plan = octet 3 from Calling party number Information Element. - para.setup.screen = octet 3a from Calling party number Information Element. - - Return: - 0 = No device matching this call. - 1 = At least one device matching this call (RING on ttyI). - HL-driver may send ALERTING on the D-channel in this case. - 2 = Call will be rejected. - 3 = Incoming called party number is currently incomplete. - Additional digits are required. - Used for signalling with PtP connections. - 4 = Call will be held in a proceeding state - (HL driver sends PROCEEDING) - Used when a user space prog needs time to interpret a call - para.setup.eazmsn may be filled with an uus1 message of - 30 octets maximum. Empty string if no uus. - 5 = Call will be actively deflected to another party - Only available in DSS1/EURO protocol - para.setup.phone must be set to destination party number - para.setup.eazmsn may be filled with an uus1 message of - 30 octets maximum. Empty string if no uus. - -1 = An error happened. (Invalid parameters for example.) - The keypad support now is included in the dial command. - - - ISDN_STAT_RUN: - - With this call, the HL-driver signals availability of the ISDN-card. - (after initializing, loading firmware) - - Parameter: - driver = driver-Id - command = ISDN_STAT_RUN - arg = unused. - parm = unused. - - ISDN_STAT_STOP: - - With this call, the HL-driver signals unavailability of the ISDN-card. - (before unloading, while resetting/reconfiguring the card) - - Parameter: - driver = driver-Id - command = ISDN_STAT_STOP - arg = unused. - parm = unused. - - ISDN_STAT_DCONN: - - With this call, the HL-driver signals the successful establishment of - a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL) - - Parameter: - driver = driver-Id - command = ISDN_STAT_DCONN - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_BCONN: - - With this call, the HL-driver signals the successful establishment of - a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the - remote-station has initiated establishment) - - The HL driver should call this when the logical l2/l3 protocol - connection on top of the physical B-channel is established. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BCONN - arg = channel-number, locally to the driver. (starting with 0) - parm.num = ASCII-String, containing type of connection (for analog - modem only). This will be appended to the CONNECT message - e.g. 14400/V.32bis - - ISDN_STAT_DHUP: - - With this call, the HL-driver signals the shutdown of a - D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP, - or caused by a remote-hangup or if the remote-station has actively - rejected a call. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DHUP - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_BHUP: - - With this call, the HL-driver signals the shutdown of a - B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP, - or caused by a remote-hangup. - - The HL driver should call this as soon as the logical l2/l3 protocol - connection on top of the physical B-channel is released. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BHUP - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_CINF: - - With this call, the HL-driver delivers charge-unit information to the - LL. - - Parameter: - driver = driver-Id - command = ISDN_STAT_CINF - arg = channel-number, locally to the driver. (starting with 0) - parm.num = ASCII string containing charge-units (digits only). - - ISDN_STAT_LOAD: (currently unused) - - ISDN_STAT_UNLOAD: - - With this call, the HL-driver signals that it will be unloaded now. This - tells the LL to release all corresponding data-structures. - - Parameter: - driver = driver-Id - command = ISDN_STAT_UNLOAD - arg = unused. - parm = unused. - - ISDN_STAT_BSENT: - - With this call the HL-driver signals the delivery of a data-packet. - This callback is used by the network-interfaces only, tty-Emulation - does not need this call. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BSENT - arg = channel-number, locally to the driver. (starting with 0) - parm.length = ***CHANGEI.1.21: New field. - the driver has to set this to the original length - of the skb at the time of receiving it from the linklevel. - - ISDN_STAT_NODCH: - - With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if - no D-Channel is available. - - Parameter: - driver = driver-Id - command = ISDN_STAT_NODCH - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_ADDCH: - - This call is for HL-drivers, which are unable to check card-type - or numbers of supported channels before they have loaded any firmware - using ioctl. Those HL-driver simply set the channel-parameter to a - minimum channel-number when registering, and later if they know - the real amount, perform this call, allocating additional channels. - - Parameter: - driver = driver-Id - command = ISDN_STAT_ADDCH - arg = number of channels to be added. - parm = unused. - - ISDN_STAT_CAUSE: - - With this call, the HL-driver delivers CAUSE-messages to the LL. - Currently the LL does not use this messages. Their contents is simply - logged via kernel-messages. Therefore, currently the format of the - messages is completely free. However they should be printable. - - Parameter: - driver = driver-Id - command = ISDN_STAT_NODCH - arg = channel-number, locally to the driver. (starting with 0) - parm.num = ASCII string containing CAUSE-message. - - ISDN_STAT_DISPLAY: - - With this call, the HL-driver delivers DISPLAY-messages to the LL. - Currently the LL does not use this messages. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DISPLAY - arg = channel-number, locally to the driver. (starting with 0) - para.display= string containing DISPLAY-message. - - ISDN_STAT_PROT: - - With this call, the HL-driver delivers protocol specific infos to the LL. - The call is not implicitely bound to a connection. - - Parameter: - driver = driver-Id - command = ISDN_STAT_PROT - arg = The lower 8 Bits define the addressed protocol as defined - in ISDN_PTYPE..., the upper bits are used to differentiate - the protocol specific STAT. - - para = protocol and function specific. See isdnif.h for detail. - - ISDN_STAT_DISCH: - - With this call, the HL-driver signals the LL to disable or enable the - use of supplied channel and driver. - The call may be used to reduce the available number of B-channels after - loading the driver. The LL has to ignore a disabled channel when searching - for free channels. The HL driver itself never delivers STAT callbacks for - disabled channels. - The LL returns a nonzero code if the operation was not successful or the - selected channel is actually regarded as busy. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DISCH - arg = channel-number, locally to the driver. (starting with 0) - parm.num[0] = 0 if channel shall be disabled, else enabled. - - ISDN_STAT_L1ERR: - - ***CHANGEI1.21 new status message. - A signal can be sent to the linklevel if an Layer1-error results in - packet-loss on receive or send. The field errcode of the cmd.parm - union describes the error more precisely. - - Parameter: - driver = driver-Id - command = ISDN_STAT_L1ERR - arg = channel-number, locally to the driver. (starting with 0) - parm.errcode= ISDN_STAT_L1ERR_SEND: Packet lost while sending. - ISDN_STAT_L1ERR_RECV: Packet lost while receiving. - ISDN_STAT_FAXIND: - - With this call the HL-driver signals a fax sub-command to the LL. - For details refer to INTERFACE.fax - - Parameter: - driver = driver-Id. - command = ISDN_STAT_FAXIND - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax deleted file mode 100644 index 9c8c6d914ec7..000000000000 --- a/Documentation/isdn/INTERFACE.fax +++ /dev/null @@ -1,163 +0,0 @@ -$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $ - - -Description of the fax-subinterface between linklevel and hardwarelevel of - isdn4linux. - - The communication between linklevel (LL) and hardwarelevel (HL) for fax - is based on the struct T30_s (defined in isdnif.h). - This struct is allocated in the LL. - In order to use fax, the LL provides the pointer to this struct with the - command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup - and when a new channel to a new connection is assigned. - - -Data handling: - In send-mode the HL-driver has to handle the codes and the bit-order - conversion by itself. - In receive-mode the LL-driver takes care of the bit-order conversion - (specified by +FBOR) - -Structure T30_s description: - - This structure stores the values (set by AT-commands), the remote- - capability-values and the command-codes between LL and HL. - - If the HL-driver receives ISDN_CMD_FAXCMD, all needed information - is in this struct set by the LL. - To signal information to the LL, the HL-driver has to set the - parameters and use ISDN_STAT_FAXIND. - (Please refer to INTERFACE) - -Structure T30_s: - - All members are 8-bit unsigned (__u8) - - - resolution - - rate - - width - - length - - compression - - ecm - - binary - - scantime - - id[] - Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ... - - - r_resolution - - r_rate - - r_width - - r_length - - r_compression - - r_ecm - - r_binary - - r_scantime - - r_id[] - Remote faxmachine's parameters. To be set by HL-driver. - - - phase - Defines the actual state of fax connection. Set by HL or LL - depending on progress and type of connection. - If the phase changes because of an AT command, the LL driver - changes this value. Otherwise the HL-driver takes care of it, but - only necessary on call establishment (from IDLE to PHASE_A). - (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E]) - - - direction - Defines outgoing/send or incoming/receive connection. - (ISDN_TTY_FAX_CONN_[IN,OUT]) - - - code - Commands from LL to HL; possible constants : - ISDN_TTY_FAX_DR signals +FDR command to HL - - ISDN_TTY_FAX_DT signals +FDT command to HL - - ISDN_TTY_FAX_ET signals +FET command to HL - - - Other than that the "code" is set with the hangup-code value at - the end of connection for the +FHNG message. - - - r_code - Commands from HL to LL; possible constants : - ISDN_TTY_FAX_CFR output of +FCFR message. - - ISDN_TTY_FAX_RID output of remote ID set in r_id[] - (+FCSI/+FTSI on send/receive) - - ISDN_TTY_FAX_DCS output of +FDCS and CONNECT message, - switching to phase C. - - ISDN_TTY_FAX_ET signals end of data, - switching to phase D. - - ISDN_TTY_FAX_FCON signals the established, outgoing connection, - switching to phase B. - - ISDN_TTY_FAX_FCON_I signals the established, incoming connection, - switching to phase B. - - ISDN_TTY_FAX_DIS output of +FDIS message and values. - - ISDN_TTY_FAX_SENT signals that all data has been sent - and is acknowledged, - OK message will be sent. - - ISDN_TTY_FAX_PTS signals a msg-confirmation (page sent successful), - depending on fet value: - 0: output OK message (more pages follow) - 1: switching to phase B (next document) - - ISDN_TTY_FAX_TRAIN_OK output of +FDCS and OK message (for receive mode). - - ISDN_TTY_FAX_EOP signals end of data in receive mode, - switching to phase D. - - ISDN_TTY_FAX_HNG output of the +FHNG and value set by code and - OK message, switching to phase E. - - - - badlin - Value of +FBADLIN - - - badmul - Value of +FBADMUL - - - bor - Value of +FBOR - - - fet - Value of +FET command in send-mode. - Set by HL in receive-mode for +FET message. - - - pollid[] - ID-string, set by +FCIG - - - cq - Value of +FCQ - - - cr - Value of +FCR - - - ctcrty - Value of +FCTCRTY - - - minsp - Value of +FMINSP - - - phcto - Value of +FPHCTO - - - rel - Value of +FREL - - - nbc - Value of +FNBC (0,1) - (+FNBC is not a known class 2 fax command, I added this to change the - automatic "best capabilities" connection in the eicon HL-driver) - - -Armin -mac@melware.de - diff --git a/Documentation/isdn/README b/Documentation/isdn/README deleted file mode 100644 index 74bd2bdb455b..000000000000 --- a/Documentation/isdn/README +++ /dev/null @@ -1,599 +0,0 @@ -README for the ISDN-subsystem - -1. Preface - - 1.1 Introduction - - This README describes how to set up and how to use the different parts - of the ISDN-subsystem. - - For using the ISDN-subsystem, some additional userlevel programs are - necessary. Those programs and some contributed utilities are available - at - - ftp.isdn4linux.de - - /pub/isdn4linux/isdn4k-utils-.tar.gz - - - We also have set up a mailing-list: - - The isdn4linux-project originates in Germany, and therefore by historical - reasons, the mailing-list's primary language is german. However mails - written in english have been welcome all the time. - - to subscribe: write a email to majordomo@listserv.isdn4linux.de, - Subject irrelevant, in the message body: - subscribe isdn4linux - - To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de - - This mailinglist is bidirectionally gated to the newsgroup - - de.alt.comm.isdn4linux - - There is also a well maintained FAQ in English available at - https://www.mhessler.de/i4lfaq/ - It can be viewed online, or downloaded in sgml/text/html format. - The FAQ can also be viewed online at - https://www.isdn4linux.de/faq/i4lfaq.html - or downloaded from - ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/ - - 1.1 Technical details - - In the following Text, the terms MSN and EAZ are used. - - MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies - to Euro(EDSS1)-type lines. Usually it is simply the phone number. - - EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and - applies to German 1TR6-type lines. This is a one-digit string, - simply appended to the base phone number - - The internal handling is nearly identical, so replace the appropriate - term to that one, which applies to your local ISDN-environment. - - When the link-level-module isdn.o is loaded, it supports up to 16 - low-level-modules with up to 64 channels. (The number 64 is arbitrarily - chosen and can be configured at compile-time --ISDN_MAX in isdn.h). - A low-level-driver can register itself through an interface (which is - defined in isdnif.h) and gets assigned a slot. - The following char-devices are made available for each channel: - - A raw-control-device with the following functions: - write: raw D-channel-messages (format: depends on driver). - read: raw D-channel-messages (format: depends on driver). - ioctl: depends on driver, i.e. for the ICN-driver, the base-address of - the ports and the shared memory on the card can be set and read - also the boot-code and the protocol software can be loaded into - the card. - - O N L Y !!! for debugging (no locking against other devices): - One raw-data-device with the following functions: - write: data to B-channel. - read: data from B-channel. - - In addition the following devices are made available: - - 128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator: - The functionality is almost the same as that of a serial device - (the line-discs are handled by the kernel), which lets you run - SLIP, CSLIP and asynchronous PPP through the devices. We have tested - Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax. - - The modem-emulation supports the following: - 1.3.1 Commands: - - ATA Answer incoming call. - ATD Dial, the number may contain: - [0-9] and [,#.*WPT-S] - the latter are ignored until 'S'. - The 'S' must precede the number, if - the line is a SPV (German 1TR6). - ATE0 Echo off. - ATE1 Echo on (default). - ATH Hang-up. - ATH1 Off hook (ignored). - ATH0 Hang-up. - ATI Return "ISDN for Linux...". - ATI0 " - ATI1 " - ATI2 Report of last connection. - ATO On line (data mode). - ATQ0 Enable result codes (default). - ATQ1 Disable result codes (default). - ATSx=y Set register x to y. - ATSx? Show contents of register x. - ATV0 Numeric responses. - ATV1 English responses (default). - ATZ Load registers and EAZ/MSN from Profile. - AT&Bx Set Send-Packet-size to x (max. 4000) - The real packet-size may be limited by the - low-level-driver used. e.g. the HiSax-Module- - limit is 2000. You will get NO Error-Message, - if you set it to higher values, because at the - time of giving this command the corresponding - driver may not be selected (see "Automatic - Assignment") however the size of outgoing packets - will be limited correctly. - AT&D0 Ignore DTR - AT&D2 DTR-low-edge: Hang up and return to - command mode (default). - AT&D3 Same as AT&D2 but also resets all registers. - AT&Ex Set the EAZ/MSN for this channel to x. - AT&F Reset all registers and profile to "factory-defaults" - AT&Lx Set list of phone numbers to listen on. x is a - list of wildcard patterns separated by semicolon. - If this is set, it has precedence over the MSN set - by AT&E. - AT&Rx Select V.110 bitrate adaption. - This command enables V.110 protocol with 9600 baud - (x=9600), 19200 baud (x=19200) or 38400 baud - (x=38400). A value of x=0 disables V.110 switching - back to default X.75. This command sets the following - Registers: - Reg 14 (Layer-2 protocol): - x = 0: 0 - x = 9600: 7 - x = 19200: 8 - x = 38400: 9 - Reg 18.2 = 1 - Reg 19 (Additional Service Indicator): - x = 0: 0 - x = 9600: 197 - x = 19200: 199 - x = 38400: 198 - Note on value in Reg 19: - There is _NO_ common convention for 38400 baud. - The value 198 is chosen arbitrarily. Users - _MUST_ negotiate this value before establishing - a connection. - AT&Sx Set window-size (x = 1..8) (not yet implemented) - AT&V Show all settings. - AT&W0 Write registers and EAZ/MSN to profile. See also - iprofd (5.c in this README). - AT&X0 BTX-mode and T.70-mode off (default) - AT&X1 BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0) - AT&X2 T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0) - AT+Rx Resume a suspended call with CallID x (x = 1,2,3...) - AT+Sx Suspend a call with CallID x (x = 1,2,3...) - - For voice-mode commands refer to README.audio - - 1.3.2 Escape sequence: - During a connection, the emulation reacts just like - a normal modem to the escape sequence +++. - (The escape character - default '+' - can be set in the - register 2). - The DELAY must at least be 1.5 seconds long and delay - between the escape characters must not exceed 0.5 seconds. - - 1.3.3 Registers: - - Nr. Default Description - 0 0 Answer on ring number. - (no auto-answer if S0=0). - 1 0 Count of rings. - 2 43 Escape character. - (a value >= 128 disables the escape sequence). - 3 13 Carriage return character (ASCII). - 4 10 Line feed character (ASCII). - 5 8 Backspace character (ASCII). - 6 3 Delay in seconds before dialing. - 7 60 Wait for carrier. - 8 2 Pause time for comma (ignored) - 9 6 Carrier detect time (ignored) - 10 7 Carrier loss to disconnect time (ignored). - 11 70 Touch tone timing (ignored). - 12 69 Bit coded register: - Bit 0: 0 = Suppress response messages. - 1 = Show response messages. - Bit 1: 0 = English response messages. - 1 = Numeric response messages. - Bit 2: 0 = Echo off. - 1 = Echo on. - Bit 3 0 = DCD always on. - 1 = DCD follows carrier. - Bit 4 0 = CTS follows RTS - 1 = Ignore RTS, CTS always on. - Bit 5 0 = return to command mode on DTR low. - 1 = Same as 0 but also resets all - registers. - See also register 13, bit 2 - Bit 6 0 = DSR always on. - 1 = DSR only on if channel is available. - Bit 7 0 = Cisco-PPP-flag-hack off (default). - 1 = Cisco-PPP-flag-hack on. - 13 0 Bit coded register: - Bit 0: 0 = Use delayed tty-send-algorithm - 1 = Direct tty-send. - Bit 1: 0 = T.70 protocol (Only for BTX!) off - 1 = T.70 protocol (Only for BTX!) on - Bit 2: 0 = Don't hangup on DTR low. - 1 = Hangup on DTR low. - Bit 3: 0 = Standard response messages - 1 = Extended response messages - Bit 4: 0 = CALLER NUMBER before every RING. - 1 = CALLER NUMBER after first RING. - Bit 5: 0 = T.70 extended protocol off - 1 = T.70 extended protocol on - Bit 6: 0 = Special RUNG Message off - 1 = Special RUNG Message on - "RUNG" is delivered on a ttyI, if - an incoming call happened (RING) and - the remote party hung up before any - local ATA was given. - Bit 7: 0 = Don't show display messages from net - 1 = Show display messages from net - (S12 Bit 1 must be 0 too) - 14 0 Layer-2 protocol: - 0 = X75/LAPB with I-frames - 1 = X75/LAPB with UI-frames - 2 = X75/LAPB with BUI-frames - 3 = HDLC - 4 = Transparent (audio) - 7 = V.110, 9600 baud - 8 = V.110, 19200 baud - 9 = V.110, 38400 baud - 10 = Analog Modem (only if hardware supports this) - 11 = Fax G3 (only if hardware supports this) - 15 0 Layer-3 protocol: - 0 = transparent - 1 = transparent with audio features (e.g. DSP) - 2 = Fax G3 Class 2 commands (S14 has to be set to 11) - 3 = Fax G3 Class 1 commands (S14 has to be set to 11) - 16 250 Send-Packet-size/16 - 17 8 Window-size (not yet implemented) - 18 4 Bit coded register, Service-Octet-1 to accept, - or to be used on dialout: - Bit 0: Service 1 (audio) when set. - Bit 1: Service 5 (BTX) when set. - Bit 2: Service 7 (data) when set. - Note: It is possible to set more than one - bit. In this case, on incoming calls - the selected services are accepted, - and if the service is "audio", the - Layer-2-protocol is automatically - changed to 4 regardless of the setting - of register 14. On outgoing calls, - the most significant 1-bit is chosen to - select the outgoing service octet. - 19 0 Service-Octet-2 - 20 0 Bit coded register (readonly) - Service-Octet-1 of last call. - Bit mapping is the same as register 18 - 21 0 Bit coded register (readonly) - Set on incoming call (during RING) to - octet 3 of calling party number IE (Numbering plan) - See section 4.5.10 of ITU Q.931 - 22 0 Bit coded register (readonly) - Set on incoming call (during RING) to - octet 3a of calling party number IE (Screening info) - See section 4.5.10 of ITU Q.931 - 23 0 Bit coded register: - Bit 0: 0 = Add CPN to RING message off - 1 = Add CPN to RING message on - Bit 1: 0 = Add CPN to FCON message off - 1 = Add CPN to FCON message on - Bit 2: 0 = Add CDN to RING/FCON message off - 1 = Add CDN to RING/FCON message on - - Last but not least a (at the moment fairly primitive) device to request - the line-status (/dev/isdninfo) is made available. - - Automatic assignment of devices to lines: - - All inactive physical lines are listening to all EAZs for incoming - calls and are NOT assigned to a specific tty or network interface. - When an incoming call is detected, the driver looks first for a network - interface and then for an opened tty which: - - 1. is configured for the same EAZ. - 2. has the same protocol settings for the B-channel. - 3. (only for network interfaces if the security flag is set) - contains the caller number in its access list. - 4. Either the channel is not bound exclusively to another Net-interface, or - it is bound AND the other checks apply to exactly this interface. - (For usage of the bind-features, refer to the isdnctrl-man-page) - - Only when a matching interface or tty is found is the call accepted - and the "connection" between the low-level-layer and the link-level-layer - is established and kept until the end of the connection. - In all other cases no connection is established. Isdn4linux can be - configured to either do NOTHING in this case (which is useful, if - other, external devices with the same EAZ/MSN are connected to the bus) - or to reject the call actively. (isdnctrl busreject ...) - - For an outgoing call, the inactive physical lines are searched. - The call is placed on the first physical line, which supports the - requested protocols for the B-channel. If a net-interface, however - is pre-bound to a channel, this channel is used directly. - - This makes it possible to configure several network interfaces and ttys - for one EAZ, if the network interfaces are set to secure operation. - If an incoming call matches one network interface, it gets connected to it. - If another incoming call for the same EAZ arrives, which does not match - a network interface, the first tty gets a "RING" and so on. - -2 System prerequisites: - - ATTENTION! - - Always use the latest module utilities. The current version is - named in Documentation/Changes. Some old versions of insmod - are not capable of setting the driver-Ids correctly. - -3. Lowlevel-driver configuration. - - Configuration depends on how the drivers are built. See the - README. for information on driver-specific setup. - -4. Device-inodes - - The major and minor numbers and their names are described in - Documentation/admin-guide/devices.rst. The major numbers are: - - 43 for the ISDN-tty's. - 44 for the ISDN-callout-tty's. - 45 for control/info/debug devices. - -5. Application - - a) For some card-types, firmware has to be loaded into the cards, before - proceeding with device-independent setup. See README. - for how to do that. - - b) If you only intend to use ttys, you are nearly ready now. - - c) If you want to have really permanent "Modem"-settings on disk, you - can start the daemon iprofd. Give it a path to a file at the command- - line. It will store the profile-settings in this file every time - an AT&W0 is performed on any ISDN-tty. If the file already exists, - all profiles are initialized from this file. If you want to unload - any of the modules, kill iprofd first. - - d) For networking, continue: Create an interface: - isdnctrl addif isdn0 - - e) Set the EAZ (or MSN for Euro-ISDN): - isdnctrl eaz isdn0 2 - - (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your - real MSN e.g.: Phone-Number) - - f) Set the number for outgoing calls on the interface: - isdnctrl addphone isdn0 out 1234567 - ... (this can be executed more than once, all assigned numbers are - tried in order) - and the number(s) for incoming calls: - isdnctrl addphone isdn0 in 1234567 - - g) Set the timeout for hang-up: - isdnctrl huptimeout isdn0 - - h) additionally you may activate charge-hang-up (= Hang up before - next charge-info, this only works, if your isdn-provider transmits - the charge-info during and after the connection): - isdnctrl chargehup isdn0 on - - i) Set the dial mode of the interface: - isdnctrl dialmode isdn0 auto - "off" means that you (or the system) cannot make any connection - (neither incoming or outgoing connections are possible). Use - this if you want to be sure that no connections will be made. - "auto" means that the interface is in auto-dial mode, and will - attempt to make a connection whenever a network data packet needs - the interface's link. Note that this can cause unexpected dialouts, - and lead to a high phone bill! Some daemons or other pc's that use - this interface can cause this. - Incoming connections are also possible. - "manual" is a dial mode created to prevent the unexpected dialouts. - In this mode, the interface will never make any connections on its - own. You must explicitly initiate a connection with "isdnctrl dial - isdn0". However, after an idle time of no traffic as configured for - the huptimeout value with isdnctrl, the connection _will_ be ended. - If you don't want any automatic hangup, set the huptimeout value to 0. - "manual" is the default. - - j) Setup the interface with ifconfig as usual, and set a route to it. - - k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use - the script tools/tcltk/isdnmon. You can add actions for line-status - changes. See the comments at the beginning of the script for how to - do that. There are other tty-based tools in the tools-subdirectory - contributed by Michael Knigge (imon), Volker Götz (imontty) and - Andreas Kool (isdnmon). - - l) For initial testing, you can set the verbose-level to 2 (default: 0). - Then all incoming calls are logged, even if they are not addressed - to one of the configured net-interfaces: - isdnctrl verbose 2 - - Now you are ready! A ping to the set address should now result in an - automatic dial-out (look at syslog kernel-messages). - The phone numbers and EAZs can be assigned at any time with isdnctrl. - You can add as many interfaces as you like with addif following the - directions above. Of course, there may be some limitations. But we have - tested as many as 20 interfaces without any problem. However, if you - don't give an interface name to addif, the kernel will assign a name - which starts with "eth". The number of "eth"-interfaces is limited by - the kernel. - -5. Additional options for isdnctrl: - - "isdnctrl secure on" - Only incoming calls, for which the caller-id is listed in the access - list of the interface are accepted. You can add caller-id's With the - command "isdnctrl addphone in " - Euro-ISDN does not transmit the leading '0' of the caller-id for an - incoming call, therefore you should configure it accordingly. - If the real number for the dialout e.g. is "09311234567" the number - to configure here is "9311234567". The pattern-match function - works similar to the shell mechanism. - - ? one arbitrary digit - * zero or arbitrary many digits - [123] one of the digits in the list - [1-5] one digit between '1' and '5' - a '^' as the first character in a list inverts the list - - - "isdnctrl secure off" - Switch off secure operation (default). - - "isdnctrl ihup [on|off]" - Switch the hang-up-timer for incoming calls on or off. - - "isdnctrl eaz " - Returns the EAZ of an interface. - - "isdnctrl delphone in|out " - Deletes a number from one of the access-lists of the interface. - - "isdnctrl delif " - Removes the interface (and possible slaves) from the kernel. - (You have to unregister it with "ifconfig down" before). - - "isdnctrl callback [on|off]" - Switches an interface to callback-mode. In this mode, an incoming call - will be rejected and after this the remote-station will be called. If - you test this feature by using ping, some routers will re-dial very - quickly, so that the callback from isdn4linux may not be recognized. - In this case use ping with the option -i to increase the interval - between echo-packets. - - "isdnctrl cbdelay [seconds]" - Sets the delay (default 5 sec) between an incoming call and start of - dialing when callback is enabled. - - "isdnctrl cbhup [on|off]" - This enables (default) or disables an active hangup (reject) when getting an - incoming call for an interface which is configured for callback. - - "isdnctrl encap " - Selects the type of packet-encapsulation. The encapsulation can be changed - only while an interface is down. - - At the moment the following values are supported: - - rawip (Default) Selects raw-IP-encapsulation. This means, MAC-headers - are stripped off. - ip IP with type-field. Same as IP but the type-field of the MAC-header - is preserved. - x25iface X.25 interface encapsulation (first byte semantics as defined in - ../networking/x25-iface.txt). Use this for running the linux - X.25 network protocol stack (AF_X25 sockets) on top of isdn. - cisco-h A special-mode for communicating with a Cisco, which is configured - to do "hdlc" - ethernet No stripping. Packets are sent with full MAC-header. - The Ethernet-address of the interface is faked, from its - IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values. - syncppp Synchronous PPP - - uihdlc HDLC with UI-frame-header (for use with DOS ISPA, option -h1) - - - NOTE: x25iface encapsulation is currently experimental. Please - read README.x25 for further details - - - Watching packets, using standard-tcpdump will fail for all encapsulations - except ethernet because tcpdump does not know how to handle packets - without MAC-header. A patch for tcpdump is included in the utility-package - mentioned above. - - "isdnctrl l2_prot " - Selects a layer-2-protocol. - (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available. - With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be - possible too. See README.x25 for x25 related l2 protocols.) - - isdnctrl l3_prot - The same for layer-3. (At the moment only "trans" is allowed) - - "isdnctrl list " - Shows all parameters of an interface and the charge-info. - Try "all" as the interface name. - - "isdnctrl hangup " - Forces hangup of an interface. - - "isdnctrl bind , [exclusive]" - If you are using more than one ISDN card, it is sometimes necessary to - dial out using a specific card or even preserve a specific channel for - dialout of a specific net-interface. This can be done with the above - command. Replace by whatever you assigned while loading the - module. The is counted from zero. The upper limit - depends on the card used. At the moment no card supports more than - 2 channels, so the upper limit is one. - - "isdnctrl unbind " - unbinds a previously bound interface. - - "isdnctrl busreject on|off" - If switched on, isdn4linux replies a REJECT to incoming calls, it - cannot match to any configured interface. - If switched off, nothing happens in this case. - You normally should NOT enable this feature, if the ISDN adapter is not - the only device connected to the S0-bus. Otherwise it could happen that - isdn4linux rejects an incoming call, which belongs to another device on - the bus. - - "isdnctrl addslave - Creates a slave interface for channel-bundling. Slave interfaces are - not seen by the kernel, but their ISDN-part can be configured with - isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more - than two channels are to be bundled, feel free to create as many as you - want. InterfaceName must be a real interface, NOT a slave. Slave interfaces - start dialing, if the master interface resp. the previous slave interface - has a load of more than 7000 cps. They hangup if the load goes under 7000 - cps, according to their "huptimeout"-parameter. - - "isdnctrl sdelay secs." - This sets the minimum time an Interface has to be fully loaded, until - it sends a dial-request to its slave. - - "isdnctrl dial " - Forces an interface to start dialing even if no packets are to be - transferred. - - "isdnctrl mapping MSN0,MSN1,MSN2,...MSN9" - This installs a mapping table for EAZ<->MSN-mapping for a single line. - Missing MSN's have to be given as "-" or can be omitted, if at the end - of the commandline. - With this command, it's now possible to have an interface listening to - mixed 1TR6- and Euro-Type lines. In this case, the interface has to be - configured to a 1TR6-type EAZ (one digit). The mapping is also valid - for tty-emulation. Seen from the interface/tty-level the mapping - CAN be used, however it's possible to use single tty's/interfaces with - real MSN's (more digits) also, in which case the mapping will be ignored. - Here is an example: - - You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with - MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO". - - isdnctrl mapping EURO -,987654,987655,987656,-,987655 - ... - isdnctrl eaz isdn0 1 # listen on 12345671(1tr6) and 987654(euro) - ... - isdnctrl eaz isdn1 4 # listen on 12345674(1tr6) only. - ... - isdnctrl eaz isdn2 987654 # listen on 987654(euro) only. - - Same scheme is used with AT&E... at the tty's. - -6. If you want to write a new low-level-driver, you are welcome. - The interface to the link-level-module is described in the file INTERFACE. - If the interface should be expanded for any reason, don't do it - on your own, send me a mail containing the proposed changes and - some reasoning about them. - If other drivers will not be affected, I will include the changes - in the next release. - For developers only, there is a second mailing-list. Write to me - (fritz@isdn4linux.de), if you want to join that list. - -Have fun! - - -Fritz - diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ deleted file mode 100644 index e5dd1addacdd..000000000000 --- a/Documentation/isdn/README.FAQ +++ /dev/null @@ -1,26 +0,0 @@ - -The FAQ for isdn4linux -====================== - -Please note that there is a big FAQ available in the isdn4k-utils. -You find it in: - isdn4k-utils/FAQ/i4lfaq.sgml - -In case you just want to see the FAQ online, or download the newest version, -you can have a look at my website: -https://www.mhessler.de/i4lfaq/ (view + download) -or: -https://www.isdn4linux.de/faq/4lfaq.html (view) - -As the extension tells, the FAQ is in SGML format, and you can convert it -into text/html/... format by using the sgml2txt/sgml2html/... tools. -Alternatively, you can also do a 'configure; make all' in the FAQ directory. - - -Please have a look at the FAQ before posting anything in the Mailinglist, -or the newsgroup! - - -Matthias Hessler -hessler@isdn4linux.de - diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio deleted file mode 100644 index 8ebca19290d9..000000000000 --- a/Documentation/isdn/README.audio +++ /dev/null @@ -1,138 +0,0 @@ -$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $ - -ISDN subsystem for Linux. - Description of audio mode. - -When enabled during kernel configuration, the tty emulator of the ISDN -subsystem is capable of a reduced set of commands to support audio. -This document describes the commands supported and the format of -audio data. - -Commands for enabling/disabling audio mode: - - AT+FCLASS=8 Enable audio mode. - This affects the following registers: - S18: Bits 0 and 2 are set. - S16: Set to 48 and any further change to - larger values is blocked. - AT+FCLASS=0 Disable audio mode. - Register 18 is set to 4. - AT+FCLASS=? Show possible modes. - AT+FCLASS? Report current mode (0 or 8). - -Commands supported in audio mode: - -All audio mode commands have one of the following forms: - - AT+Vxx? Show current setting. - AT+Vxx=? Show possible settings. - AT+Vxx=v Set simple parameter. - AT+Vxx=v,v ... Set complex parameter. - -where xx is a two-character code and v are alphanumerical parameters. -The following commands are supported: - - AT+VNH=x Auto hangup setting. NO EFFECT, supported - for compatibility only. - AT+VNH? Always reporting "1" - AT+VNH=? Always reporting "1" - - AT+VIP Reset all audio parameters. - - AT+VLS=x Line select. x is one of the following: - 0 = No device. - 2 = Phone line. - AT+VLS=? Always reporting "0,2" - AT+VLS? Show current line. - - AT+VRX Start recording. Emulator responds with - CONNECT and starts sending audio data to - the application. See below for data format - - AT+VSD=x,y Set silence-detection parameters. - Possible parameters: - x = 0 ... 31 sensitivity threshold level. - (default 0 , deactivated) - y = 0 ... 255 range of interval in units - of 0.1 second. (default 70) - AT+VSD=? Report possible parameters. - AT+VSD? Show current parameters. - - AT+VDD=x,y Set DTMF-detection parameters. - Only possible if online and during this connection. - Possible parameters: - x = 0 ... 15 sensitivity threshold level. - (default 0 , I4L soft-decode) - (1-15 soft-decode off, hardware on) - y = 0 ... 255 tone duration in units of 5ms. - Not for I4L soft decode (default 8, 40ms) - AT+VDD=? Report possible parameters. - AT+VDD? Show current parameters. - - AT+VSM=x Select audio data format. - Possible parameters: - 2 = ADPCM-2 - 3 = ADPCM-3 - 4 = ADPCM-4 - 5 = aLAW - 6 = uLAW - AT+VSM=? Show possible audio formats. - - AT+VTX Start audio playback. Emulator responds - with CONNECT and starts sending audio data - received from the application via phone line. -General behavior and description of data formats/protocol. - when a connection is made: - - On incoming calls, if the application responds to a RING - with ATA, depending on the calling service, the emulator - responds with either CONNECT (data call) or VCON (voice call). - - On outgoing voice calls, the emulator responds with VCON - upon connection setup. - - Audio recording. - - When receiving audio data, a kind of bisync protocol is used. - Upon AT+VRX command, the emulator responds with CONNECT, and - starts sending audio data to the application. There are several - escape sequences defined, all using DLE (0x10) as Escape char: - - End of audio data. (i.e. caused by a - hangup of the remote side) Emulator stops - recording, responding with VCON. - Abort recording, (send by appl.) Emulator - stops recording, sends DLE,ETX. - Escape sequence for DLE in data stream. - 0 Touchtone "0" received. - ... - 9 Touchtone "9" received. - # Touchtone "#" received. - * Touchtone "*" received. - A Touchtone "A" received. - B Touchtone "B" received. - C Touchtone "C" received. - D Touchtone "D" received. - - q quiet. Silence detected after non-silence. - s silence. Silence detected from the - start of recording. - - Currently unsupported DLE sequences: - - c FAX calling tone received. - b busy tone received. - - Audio playback. - - When sending audio data, upon AT+VTX command, emulator responds with - CONNECT, and starts transferring data from application to the phone line. - The same DLE sequences apply to this mode. - - Full-Duplex-Audio: - - When _both_ commands for recording and playback are given in _one_ - AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected. - In this mode, the only way to stop recording is sending - and the only way to stop playback is to send . - diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap deleted file mode 100644 index a76d74845a4c..000000000000 --- a/Documentation/isdn/README.concap +++ /dev/null @@ -1,259 +0,0 @@ -Description of the "concap" encapsulation protocol interface -============================================================ - -The "concap" interface is intended to be used by network device -drivers that need to process an encapsulation protocol. -It is assumed that the protocol interacts with a linux network device by -- data transmission -- connection control (establish, release) -Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol". - -This is currently only used inside the isdn subsystem. But it might -also be useful to other kinds of network devices. Thus, if you want -to suggest changes that improve usability or performance of the -interface, please let me know. I'm willing to include them in future -releases (even if I needed to adapt the current isdn code to the -changed interface). - - -Why is this useful? -=================== - -The encapsulation protocol used on top of WAN connections or permanent -point-to-point links are frequently chosen upon bilateral agreement. -Thus, a device driver for a certain type of hardware must support -several different encapsulation protocols at once. - -The isdn device driver did already support several different -encapsulation protocols. The encapsulation protocol is configured by a -user space utility (isdnctrl). The isdn network interface code then -uses several case statements which select appropriate actions -depending on the currently configured encapsulation protocol. - -In contrast, LAN network interfaces always used a single encapsulation -protocol which is unique to the hardware type of the interface. The LAN -encapsulation is usually done by just sticking a header on the data. Thus, -traditional linux network device drivers used to process the -encapsulation protocol directly (usually by just providing a hard_header() -method in the device structure) using some hardware type specific support -functions. This is simple, direct and efficient. But it doesn't fit all -the requirements for complex WAN encapsulations. - - - The configurability of the encapsulation protocol to be used - makes isdn network interfaces more flexible, but also much more - complex than traditional lan network interfaces. - - -Many Encapsulation protocols used on top of WAN connections will not just -stick a header on the data. They also might need to set up or release -the WAN connection. They also might want to send other data for their -private purpose over the wire, e.g. ppp does a lot of link level -negotiation before the first piece of user data can be transmitted. -Such encapsulation protocols for WAN devices are typically more complex -than encapsulation protocols for lan devices. Thus, network interface -code for typical WAN devices also tends to be more complex. - - -In order to support Linux' x25 PLP implementation on top of -isdn network interfaces I could have introduced yet another branch to -the various case statements inside drivers/isdn/isdn_net.c. -This eventually made isdn_net.c even more complex. In addition, it made -isdn_net.c harder to maintain. Thus, by identifying an abstract -interface between the network interface code and the encapsulation -protocol, complexity could be reduced and maintainability could be -increased. - - -Likewise, a similar encapsulation protocol will frequently be needed by -several different interfaces of even different hardware type, e.g. the -synchronous ppp implementation used by the isdn driver and the -asynchronous ppp implementation used by the ppp driver have a lot of -similar code in them. By cleanly separating the encapsulation protocol -from the hardware specific interface stuff such code could be shared -better in future. - - -When operating over dial-up-connections (e.g. telephone lines via modem, -non-permanent virtual circuits of wide area networks, ISDN) many -encapsulation protocols will need to control the connection. Therefore, -some basic connection control primitives are supported. The type and -semantics of the connection (i.e the ISO layer where connection service -is provided) is outside our scope and might be different depending on -the encapsulation protocol used, e.g. for a ppp module using our service -on top of a modem connection a connect_request will result in dialing -a (somewhere else configured) remote phone number. For an X25-interface -module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt) -a connect_request will ask for establishing a reliable lapb -datalink connection. - - -The encapsulation protocol currently provides the following -service primitives to the network device. - -- create a new encapsulation protocol instance -- delete encapsulation protocol instance and free all its resources -- initialize (open) the encapsulation protocol instance for use. -- deactivate (close) an encapsulation protocol instance. -- process (xmit) data handed down by upper protocol layer -- receive data from lower (hardware) layer -- process connect indication from lower (hardware) layer -- process disconnect indication from lower (hardware) layer - - -The network interface driver accesses those primitives via callbacks -provided by the encapsulation protocol instance within a -struct concap_proto_ops. - -struct concap_proto_ops{ - - /* create a new encapsulation protocol instance of same type */ - struct concap_proto * (*proto_new) (void); - - /* delete encapsulation protocol instance and free all its resources. - cprot may no longer be referenced after calling this */ - void (*proto_del)(struct concap_proto *cprot); - - /* initialize the protocol's data. To be called at interface startup - or when the device driver resets the interface. All services of the - encapsulation protocol may be used after this*/ - int (*restart)(struct concap_proto *cprot, - struct net_device *ndev, - struct concap_device_ops *dops); - - /* deactivate an encapsulation protocol instance. The encapsulation - protocol may not call any *dops methods after this. */ - int (*close)(struct concap_proto *cprot); - - /* process a frame handed down to us by upper layer */ - int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called for each data entity received from lower layer*/ - int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called when a connection was set up/down. - Protocols that don't process these primitives might fill in - dummy methods here */ - int (*connect_ind)(struct concap_proto *cprot); - int (*disconn_ind)(struct concap_proto *cprot); -}; - - -The data structures are defined in the header file include/linux/concap.h. - - -A Network interface using encapsulation protocols must also provide -some service primitives to the encapsulation protocol: - -- request data being submitted by lower layer (device hardware) -- request a connection being set up by lower layer -- request a connection being released by lower layer - -The encapsulation protocol accesses those primitives via callbacks -provided by the network interface within a struct concap_device_ops. - -struct concap_device_ops{ - - /* to request data be submitted by device */ - int (*data_req)(struct concap_proto *, struct sk_buff *); - - /* Control methods must be set to NULL by devices which do not - support connection control. */ - /* to request a connection be set up */ - int (*connect_req)(struct concap_proto *); - - /* to request a connection be released */ - int (*disconn_req)(struct concap_proto *); -}; - -The network interface does not explicitly provide a receive service -because the encapsulation protocol directly calls netif_rx(). - - - - -An encapsulation protocol itself is actually the -struct concap_proto{ - struct net_device *net_dev; /* net device using our service */ - struct concap_device_ops *dops; /* callbacks provided by device */ - struct concap_proto_ops *pops; /* callbacks provided by us */ - int flags; - void *proto_data; /* protocol specific private data, to - be accessed via *pops methods only*/ - /* - : - whatever - : - */ -}; - -Most of this is filled in when the device requests the protocol to -be reset (opend). The network interface must provide the net_dev and -dops pointers. Other concap_proto members should be considered private -data that are only accessed by the pops callback functions. Likewise, -a concap proto should access the network device's private data -only by means of the callbacks referred to by the dops pointer. - - -A possible extended device structure which uses the connection controlling -encapsulation services could look like this: - -struct concap_device{ - struct net_device net_dev; - struct my_priv /* device->local stuff */ - /* the my_priv struct might contain a - struct concap_device_ops *dops; - to provide the device specific callbacks - */ - struct concap_proto *cprot; /* callbacks provided by protocol */ -}; - - - -Misc Thoughts -============= - -The concept of the concap proto might help to reuse protocol code and -reduce the complexity of certain network interface implementations. -The trade off is that it introduces yet another procedure call layer -when processing the protocol. This has of course some impact on -performance. However, typically the concap interface will be used by -devices attached to slow lines (like telephone, isdn, leased synchronous -lines). For such slow lines, the overhead is probably negligible. -This might no longer hold for certain high speed WAN links (like -ATM). - - -If general linux network interfaces explicitly supported concap -protocols (e.g. by a member struct concap_proto* in struct net_device) -then the interface of the service function could be changed -by passing a pointer of type (struct net_device*) instead of -type (struct concap_proto*). Doing so would make many of the service -functions compatible to network device support functions. - -e.g. instead of the concap protocol's service function - - int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); - -we could have - - int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb); - -As this is compatible to the dev->hard_start_xmit() method, the device -driver could directly register the concap protocol's encap_and_xmit() -function as its hard_start_xmit() method. This would eliminate one -procedure call layer. - - -The device's data request function could also be defined as - - int (*data_req)(struct net_device *ndev, struct sk_buff *skb); - -This might even allow for some protocol stacking. And the network -interface might even register the same data_req() function directly -as its hard_start_xmit() method when a zero layer encapsulation -protocol is configured. Thus, eliminating the performance penalty -of the concap interface when a trivial concap protocol is used. -Nevertheless, the device remains able to support encapsulation -protocol configuration. - diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion deleted file mode 100644 index bddcd5fb86ff..000000000000 --- a/Documentation/isdn/README.diversion +++ /dev/null @@ -1,127 +0,0 @@ -The isdn diversion services are a supporting module working together with -the isdn4linux and the HiSax module for passive cards. -Active cards, TAs and cards using a own or other driver than the HiSax -module need to be adapted to the HL<->LL interface described in a separate -document. The diversion services may be used with all cards supported by -the HiSax driver. -The diversion kernel interface and controlling tool divertctrl were written -by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the -GNU General Public License. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -Table of contents -================= - -1. Features of the i4l diversion services - (Or what can the i4l diversion services do for me) - -2. Required hard- and software - -3. Compiling, installing and loading/unloading the module - Tracing calling and diversion information - -4. Tracing calling and diversion information - -5. Format of the divert device ASCII output - - -1. Features of the i4l diversion services - (Or what can the i4l diversion services do for me) - - The i4l diversion services offers call forwarding and logging normally - only supported by isdn phones. Incoming calls may be diverted - unconditionally (CFU), when not reachable (CFNR) or on busy condition - (CFB). - The diversions may be invoked statically in the providers exchange - as normally done by isdn phones. In this case all incoming calls - with a special (or all) service identifiers are forwarded if the - forwarding reason is met. Activated static services may also be - interrogated (queried). - The i4l diversion services additionally offers a dynamic version of - call forwarding which is not preprogrammed inside the providers exchange - but dynamically activated by i4l. - In this case all incoming calls are checked by rules that may be - compared to the mechanism of ipfwadm or ipchains. If a given rule matches - the checking process is finished and the rule matching will be applied - to the call. - The rules include primary and secondary service identifiers, called - number and subaddress, callers number and subaddress and whether the rule - matches to all filtered calls or only those when all B-channel resources - are exhausted. - Actions that may be invoked by a rule are ignore, proceed, reject, - direct divert or delayed divert of a call. - All incoming calls matching a rule except the ignore rule a reported and - logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed - is selected the call will be held in a proceeding state (without ringing) - for a certain amount of time to let an external program or client decide - how to handle the call. - - -2. Required hard- and software - - For using the i4l diversion services the isdn line must be of a EURO/DSS1 - type. Additionally the i4l services only work together with the HiSax - driver for passive isdn cards. All HiSax supported cards may be used for - the diversion purposes. - The static diversion services require the provider having static services - CFU, CFNR, CFB activated on an MSN-line. The static services may not be - used on a point-to-point connection. Further the static services are only - available in some countries (for example germany). Countries requiring the - keypad protocol for activating static diversions (like the netherlands) are - not supported but may use the tty devices for this purpose. - The dynamic diversion services may be used in all countries if the provider - enables the feature CF (call forwarding). This should work on both MSN- and - point-to-point lines. - To add and delete rules the additional divertctrl program is needed. This - program is part of the isdn4kutils package. - -3. Compiling, installing and loading/unloading the module - Tracing calling and diversion information - - - To compile the i4l code with diversion support you need to say yes to the - DSS1 diversion services when selecting the i4l options in the kernel - config (menuconfig or config). - After having properly activated a make modules and make modules_install all - required modules will be correctly installed in the needed modules dirs. - As the diversion services are currently not included in the scripts of most - standard distributions you will have to add a "insmod dss1_divert" after - having loaded the global isdn module. - The module can be loaded without any command line parameters. - If the module is actually loaded and active may be checked with a - "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is - dynamically created by the diversion module and removed when the module is - unloaded. - - -4. Tracing calling and diversion information - - You also may put a "cat /proc/net/isdn/divert" in the background with the - output redirected to a file. Then all actions of the module are logged. - The divert file in the proc system may be opened more than once, so in - conjunction with inetd and a small remote client on other machines inside - your network incoming calls and reactions by the module may be shown on - every listening machine. - If a call is reported as proceeding an external program or client may - specify during a certain amount of time (normally 4 to 10 seconds) what - to do with that call. - To unload the module all open files to the device in the proc system must - be closed. Otherwise the module (and isdn.o) may not be unloaded. - -5. Format of the divert device ASCII output - - To be done later - diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax deleted file mode 100644 index 5314958a8a6e..000000000000 --- a/Documentation/isdn/README.fax +++ /dev/null @@ -1,45 +0,0 @@ - -Fax with isdn4linux -=================== - -When enabled during kernel configuration, the tty emulator -of the ISDN subsystem is capable of the Fax Class 2 commands. - -This only makes sense under the following conditions : - -- You need the commands as dummy, because you are using - hylafax (with patch) for AVM capi. -- You want to use the fax capabilities of your isdn-card. - (supported cards are listed below) - - -NOTE: This implementation does *not* support fax with passive - ISDN-cards (known as softfax). The low-level driver of - the ISDN-card and/or the card itself must support this. - - -Supported ISDN-Cards --------------------- - -Eicon DIVA Server BRI/PCI - - full support with both B-channels. - -Eicon DIVA Server 4BRI/PCI - - full support with all B-channels. - -Eicon DIVA Server PRI/PCI - - full support on amount of B-channels - depending on DSPs on board. - - - -The command set is known as Class 2 (not Class 2.0) and -can be activated by AT+FCLASS=2 - - -The interface between the link-level-module and the hardware-level driver -is described in the files INTERFACE.fax and INTERFACE. - -Armin -mac@melware.de - diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci deleted file mode 100644 index e8a4ef0226e8..000000000000 --- a/Documentation/isdn/README.hfc-pci +++ /dev/null @@ -1,41 +0,0 @@ -The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used -for many OEM cards using this chips. -Additionally the driver has a special feature which makes it possible -to read the echo-channel of the isdn bus. So all frames in both directions -may be logged. -When the echo logging feature is used the number of available B-channels -for a HFC-PCI card is reduced to 1. Of course this is only relevant to -the card, not to the isdn line. -To activate the echo mode the following ioctls must be entered: - -hisaxctrl 10 1 - -This reduces the available channels to 1. There must not be open connections -through this card when entering the command. -And then: - -hisaxctrl 12 1 - -This enables the echo mode. If Hex logging is activated the isdnctrlx -devices show a output with a line beginning of HEX: for the providers -exchange and ECHO: for isdn devices sending to the provider. - -If more than one HFC-PCI cards are installed, a specific card may be selected -at the hisax module load command line. Supply the load command with the desired -IO-address of the desired card. -Example: -There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400 -and 0xdc00 -If you want to use the card at 0xd400 standalone you should supply the insmod -or depmod with type=35 io=0xd400. -If you want to use all three cards, but the order needs to be at 0xdc00,0xd400, -0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00 -Then the desired card will be the initialised in the desired order. -If the io parameter is used the io addresses of all used cards should be -supplied else the parameter is assumed 0 and a auto search for a free card is -invoked which may not give the wanted result. - -Comments and reports to werner@isdn4linux.de or werner@isdn-development.de - - - diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp deleted file mode 100644 index 27d260095cce..000000000000 --- a/Documentation/isdn/README.syncppp +++ /dev/null @@ -1,58 +0,0 @@ -Some additional information for setting up a syncPPP -connection using network interfaces. ---------------------------------------------------------------- - -You need one thing beside the isdn4linux package: - - a patched pppd .. (I called it ipppd to show the difference) - -Compiling isdn4linux with sync PPP: ------------------------------------ -To compile isdn4linux with the sync PPP part, you have -to answer the appropriate question when doing a "make config" -Don't forget to load the slhc.o -module before the isdn.o module, if VJ-compression support -is not compiled into your kernel. (e.g if you have no PPP or -CSLIP in the kernel) - -Using isdn4linux with sync PPP: -------------------------------- -Sync PPP is just another encapsulation for isdn4linux. The -name to enable sync PPP encapsulation is 'syncppp' .. e.g: - - /sbin/isdnctrl encap ippp0 syncppp - -The name of the interface is here 'ippp0'. You need -one interface with the name 'ippp0' to saturate the -ipppd, which checks the ppp version via this interface. -Currently, all devices must have the name ipppX where -'X' is a decimal value. - -To set up a PPP connection you need the ipppd .. You must start -the ipppd once after installing the modules. The ipppd -communicates with the isdn4linux link-level driver using the -/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle -all devices at once. If you want to use two PPP connections -at the same time, you have to connect the ipppd to two -devices .. and so on. -I've implemented one additional option for the ipppd: - 'useifip' will get (if set to not 0.0.0.0) the IP address - for the negotiation from the attached network-interface. -(also: ipppd will try to negotiate pointopoint IP as remote IP) -You must disable BSD-compression, this implementation can't -handle compressed packets. - -Check the etc/rc.isdn.syncppp in the isdn4kernel-util package -for an example setup script. - -To use the MPPP stuff, you must configure a slave device -with isdn4linux. Now call the ipppd with the '+mp' option. -To increase the number of links, you must use the -'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is -an example script) - -enjoy it, - michael - - - diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25 deleted file mode 100644 index e561a77c4e22..000000000000 --- a/Documentation/isdn/README.x25 +++ /dev/null @@ -1,184 +0,0 @@ - -X.25 support within isdn4linux -============================== - -This is alpha/beta test code. Use it completely at your own risk. -As new versions appear, the stuff described here might suddenly change -or become invalid without notice. - -Keep in mind: - -You are using several new parts of the 2.2.x kernel series which -have not been tested in a large scale. Therefore, you might encounter -more bugs as usual. - -- If you connect to an X.25 neighbour not operated by yourself, ASK the - other side first. Be prepared that bugs in the protocol implementation - might result in problems. - -- This implementation has never wiped out my whole hard disk yet. But as - this is experimental code, don't blame me if that happened to you. - Backing up important data will never harm. - -- Monitor your isdn connections while using this software. This should - prevent you from undesired phone bills in case of driver problems. - - - - -How to configure the kernel -=========================== - -The ITU-T (former CCITT) X.25 network protocol layer has been implemented -in the Linux source tree since version 2.1.16. The isdn subsystem might be -useful to run X.25 on top of ISDN. If you want to try it, select - - "CCITT X.25 Packet Layer" - -from the networking options as well as - - "ISDN Support" and "X.25 PLP on Top of ISDN" - -from the ISDN subsystem options when you configure your kernel for -compilation. You currently also need to enable -"Prompt for development and/or incomplete code/drivers" from the -"Code maturity level options" menu. For the x25trace utility to work -you also need to enable "Packet socket". - -For local testing it is also recommended to enable the isdnloop driver -from the isdn subsystem's configuration menu. - -For testing, it is recommended that all isdn drivers and the X.25 PLP -protocol are compiled as loadable modules. Like this, you can recover -from certain errors by simply unloading and reloading the modules. - - - -What's it for? How to use it? -============================= - -X.25 on top of isdn might be useful with two different scenarios: - -- You might want to access a public X.25 data network from your Linux box. - You can use i4l if you were physically connected to the X.25 switch - by an ISDN B-channel (leased line as well as dial up connection should - work). - - This corresponds to ITU-T recommendation X.31 Case A (circuit-mode - access to PSPDN [packet switched public data network]). - - NOTE: X.31 also covers a Case B (access to PSPDN via virtual - circuit / packet mode service). The latter mode (which in theory - also allows using the D-channel) is not supported by isdn4linux. - It should however be possible to establish such packet mode connections - with certain active isdn cards provided that the firmware supports X.31 - and the driver exports this functionality to the user. Currently, - the AVM B1 driver is the only driver which does so. (It should be - possible to access D-channel X.31 with active AVM cards using the - CAPI interface of the AVM-B1 driver). - -- Or you might want to operate certain ISDN teleservices on your linux - box. A lot of those teleservices run on top of the ISO-8208 - (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the - same as ITU-T X.25. - - Popular candidates of such teleservices are EUROfile transfer or any - teleservice applying ITU-T recommendation T.90. - -To use the X.25 protocol on top of isdn, just create an isdn network -interface as usual, configure your own and/or peer's ISDN numbers, -and choose x25iface encapsulation by - - isdnctrl encap x25iface. - -Once encap is set like this, the device can be used by the X.25 packet layer. - -All the stuff needed for X.25 is implemented inside the isdn link -level (mainly isdn_net.c and some new source files). Thus, it should -work with every existing HL driver. I was able to successfully open X.25 -connections on top of the isdnloop driver and the hisax driver. -"x25iface"-encapsulation bypasses demand dialing. Dialing will be -initiated when the upper (X.25 packet) layer requests the lapb datalink to -be established. But hangup timeout is still active. Whenever a hangup -occurs, all existing X.25 connections on that link will be cleared -It is recommended to use sufficiently large hangup-timeouts for the -isdn interfaces. - - -In order to set up a conforming protocol stack you also need to -specify the proper l2_prot parameter: - -To operate in ISO-8208 X.25 DTE-DTE mode, use - - isdnctrl l2_prot x75i - -To access an X.25 network switch via isdn (your linux box is the DTE), use - - isdnctrl l2_prot x25dte - -To mimic an X.25 network switch (DCE side of the connection), use - - isdnctrl l2_prot x25dce - -However, x25dte or x25dce is currently not supported by any real HL -level driver. The main difference between x75i and x25dte/dce is that -x25d[tc]e uses fixed lap_b addresses. With x75i, the side which -initiates the isdn connection uses the DTE's lap_b address while the -called side used the DCE's lap_b address. Thus, l2_prot x75i might -probably work if you access a public X.25 network as long as the -corresponding isdn connection is set up by you. At least one test -was successful to connect via isdn4linux to an X.25 switch using this -trick. At the switch side, a terminal adapter X.21 was used to connect -it to the isdn. - - -How to set up a test installation? -================================== - -To test X.25 on top of isdn, you need to get - -- a recent version of the "isdnctrl" program that supports setting the new - X.25 specific parameters. - -- the x25-utils-2.X package from - ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-* - (don't confuse the x25-utils with the ax25-utils) - -- an application program that uses linux PF_X25 sockets (some are - contained in the x25-util package). - -Before compiling the user level utilities make sure that the compiler/ -preprocessor will fetch the proper kernel header files of this kernel -source tree. Either make /usr/include/linux a symbolic link pointing to -this kernel's include/linux directory or set the appropriate compiler flags. - -When all drivers and interfaces are loaded and configured you need to -ifconfig the network interfaces up and add X.25-routes to them. Use -the usual ifconfig tool. - -ifconfig up - -But a special x25route tool (distributed with the x25-util package) -is needed to set up X.25 routes. I.e. - -x25route add 01 - -will cause all x.25 connections to the destination X.25-address -"01" to be routed to your created isdn network interface. - -There are currently no real X.25 applications available. However, for -tests, the x25-utils package contains a modified version of telnet -and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can -use those for your first tests. Furthermore, you might check -ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some -alpha-test implementation ("eftp4linux") of the EUROfile transfer -protocol. - -The scripts distributed with the eftp4linux test releases might also -provide useful examples for setting up X.25 on top of isdn. - -The x25-utility package also contains an x25trace tool that can be -used to monitor X.25 packets received by the network interfaces. -The /proc/net/x25* files also contain useful information. - -- Henner diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ deleted file mode 100644 index 3257a4bc0786..000000000000 --- a/Documentation/isdn/syncPPP.FAQ +++ /dev/null @@ -1,224 +0,0 @@ -simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' -------------------------------------------------------------------- - -Q01: what's pppd, ipppd, syncPPP, asyncPPP ?? -Q02: error message "this system lacks PPP support" -Q03: strange information using 'ifconfig' -Q04: MPPP?? What's that and how can I use it ... -Q05: I tried MPPP but it doesn't work -Q06: can I use asynchronous PPP encapsulation with network devices -Q07: A SunISDN machine can't connect to my i4l system -Q08: I wanna talk to several machines, which need different configs -Q09: Starting the ipppd, I get only error messages from i4l -Q10: I wanna use dynamic IP address assignment -Q11: I can't connect. How can I check where the problem is. -Q12: How can I reduce login delay? - -------------------------------------------------------------------- - -Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ? - what should I use? -A: The pppd is for asynchronous PPP .. asynchronous means - here, the framing is character based. (e.g when - using ttyI* or tty* devices) - - The ipppd handles PPP packets coming in HDLC - frames (bit based protocol) ... The PPP driver - in isdn4linux pushes all IP packets direct - to the network layer and all PPP protocol - frames to the /dev/ippp* device. - So, the ipppd is a simple external network - protocol handler. - - If you login into a remote machine using the - /dev/ttyI* devices and then enable PPP on the - remote terminal server -> use the 'old' pppd - - If your remote side immediately starts to send - frames ... you probably connect to a - syncPPP machine .. use the network device part - of isdn4linux with the 'syncppp' encapsulation - and make sure, that the ipppd is running and - connected to at least one /dev/ippp*. Check the - isdn4linux manual on how to configure a network device. - --- - -Q02: when I start the ipppd .. I only get the - error message "this system lacks PPP support" -A: check that at least the device 'ippp0' exists. - (you can check this e.g with the program 'ifconfig') - The ipppd NEEDS this device under THIS name .. - If this device doesn't exists, use: - isdnctrl addif ippp0 - isdnctrl encap ippp0 syncppp - ... (see isdn4linux doc for more) ... -A: Maybe you have compiled the ipppd with another - kernel source tree than the kernel you currently - run ... - --- - -Q03: when I list the netdevices with ifconfig I see, that - my ISDN interface has a HWaddr and IRQ=0 and Base - address = 0 -A: The device is a fake ethernet device .. ignore IRQ and baseaddr - You need the HWaddr only for ethernet encapsulation. - --- - -Q04: MPPP?? What's that and how can I use it ... - -A: MPPP or MP or MPP (Warning: MP is also an - acronym for 'Multi Processor') stands for - Multi Point to Point and means bundling - of several channels to one logical stream. - To enable MPPP negotiation you must call the - ipppd with the '+mp' option. - You must also configure a slave device for - every additional channel. (see the i4l manual - for more) - To use channel bundling you must first activate - the 'master' or initial call. Now you can add - the slave channels with the command: - isdnctrl addlink - e.g: - isdnctrl addlink ippp0 - This is different from other encapsulations of - isdn4linux! With syncPPP, there is no automatic - activation of slave devices. - --- - -Q05: I tried MPPP but it doesn't work .. the ipppd - writes in the debug log something like: - .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ... - .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ... - -A: you forgot to compile MPPP/RFC1717 support into the - ISDN Subsystem. Recompile with this option enabled. - --- - -Q06: can I use asynchronous PPP encapsulation - over the network interface of isdn4linux .. - -A: No .. that's not possible .. Use the standard - PPP package over the /dev/ttyI* devices. You - must not use the ipppd for this. - --- - -Q07: A SunISDN machine tries to connect my i4l system, - which doesn't work. - Checking the debug log I just saw garbage like: -!![ ... fill in the line ... ]!! - -A: The Sun tries to talk asynchronous PPP ... i4l - can't understand this ... try to use the ttyI* - devices with the standard PPP/pppd package - -A: (from Alexanter Strauss: ) -!![ ... fill in mail ]!! - --- - -Q08: I wanna talk to remote machines, which need - a different configuration. The only way - I found to do this is to kill the ipppd and - start a new one with another config to connect - to the second machine. - -A: you must bind a network interface explicitly to - an ippp device, where you can connect a (for this - interface) individually configured ipppd. - --- - -Q09: When I start the ipppd I only get error messages - from the i4l driver .. - -A: When starting, the ipppd calls functions which may - trigger a network packet. (e.g gethostbyname()). - Without the ipppd (at this moment, it is not - fully started) we can't handle this network request. - Try to configure hostnames necessary for the ipppd - in your local /etc/hosts file or in a way, that - your system can resolve it without using an - isdn/ippp network-interface. - --- - -Q10: I wanna use dynamic IP address assignment ... How - must I configure the network device. - -A: At least you must have a route which forwards - a packet to the ippp network-interface to trigger - the dial-on-demand. - A default route to the ippp-interface will work. - Now you must choose a dummy IP address for your - interface. - If for some reason you can't set the default - route to the ippp interface, you may take any - address of the subnet from which you expect your - dynamic IP number and set a 'network route' for - this subnet to the ippp interface. - To allow overriding of the dummy address you - must call the ipppd with the 'ipcp-accept-local' option. - -A: You must know, how the ipppd gets the addresses it wanna - configure. If you don't give any option, the ipppd - tries to negotiate the local host address! - With the option 'noipdefault' it requests an address - from the remote machine. With 'useifip' it gets the - addresses from the net interface. Or you set the address - on the option line with the option. - Note: the IP address of the remote machine must be configured - locally or the remote machine must send it in an IPCP request. - If your side doesn't know the IP address after negotiation, it - closes the connection! - You must allow overriding of address with the 'ipcp-accept-*' - options, if you have set your own or the remote address - explicitly. - -A: Maybe you try these options .. e.g: - - /sbin/ipppd :$REMOTE noipdefault /dev/ippp0 - - where REMOTE must be the address of the remote machine (the - machine, which gives you your address) - --- - -Q11: I can't connect. How can I check where the problem is. - -A: A good help log is the debug output from the ipppd... - Check whether you can find there: - - only a few LCP-conf-req SENT messages (less then 10) - and then a Term-REQ: - -> check whether your ISDN card is well configured - it seems, that your machine doesn't dial - (IRQ,IO,Proto, etc problems) - Configure your ISDN card to print debug messages and - check the /dev/isdnctrl output next time. There - you can see, whether there is activity on the card/line. - - there are at least a few RECV messages in the log: - -> fine: your card is dialing and your remote machine - tries to talk with you. Maybe only a missing - authentication. Check your ipppd configuration again. - - the ipppd exits for some reason: - -> not good ... check /var/adm/syslog and /var/adm/daemon. - Could be a bug in the ipppd. - --- - -Q12: How can I reduce login delay? - -A: Log a login session ('debug' log) and check which options - your remote side rejects. Next time configure your ipppd - to not negotiate these options. Another 'side effect' is, that - this increases redundancy. (e.g your remote side is buggy and - rejects options in a wrong way). - - - diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 18735dc460a0..111636ad1bad 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -23,8 +23,8 @@ running, the suggested command should tell you. Again, keep in mind that this list assumes you are already functionally running a Linux kernel. Also, not all tools are necessary on all -systems; obviously, if you don't have any ISDN hardware, for example, -you probably needn't concern yourself with isdn4k-utils. +systems; obviously, if you don't have any PC Card hardware, for example, +you probably needn't concern yourself with pcmciautils. ====================== =============== ======================================== Program Minimal version Command to check the version @@ -45,7 +45,6 @@ btrfs-progs 0.18 btrfsck pcmciautils 004 pccardctl -V quota-tools 3.09 quota -V PPP 2.4.0 pppd --version -isdn4k-utils 3.1pre1 isdnctrl 2>&1|grep version nfs-utils 1.0.5 showmount --version procps 3.2.0 ps --version oprofile 0.9 oprofiled --version @@ -279,12 +278,6 @@ which can be made by:: as root. -Isdn4k-utils ------------- - -Due to changes in the length of the phone number field, isdn4k-utils -needs to be recompiled or (preferably) upgraded. - NFS-utils --------- @@ -448,11 +441,6 @@ PPP - -Isdn4k-utils ------------- - -- - NFS-utils --------- -- cgit From 96ef07f79ae8644ab9c277e3f2f4fb04a54be43d Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 1 May 2019 17:14:05 -0700 Subject: dt-bindings: phy: Add binding for Qualcomm PCIe2 PHY The Qualcomm PCIe2 PHY is a Synopsys based PCIe PHY found in a number of Qualcomm platforms, add a binding to describe this. Signed-off-by: Bjorn Andersson Reviewed-by: Rob Herring Signed-off-by: Kishon Vijay Abraham I --- .../devicetree/bindings/phy/qcom-pcie2-phy.txt | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt new file mode 100644 index 000000000000..30064253f290 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt @@ -0,0 +1,42 @@ +Qualcomm PCIe2 PHY controller +============================= + +The Qualcomm PCIe2 PHY is a Synopsys based phy found in a number of Qualcomm +platforms. + +Required properties: + - compatible: compatible list, should be: + "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy" + + - reg: offset and length of the PHY register set. + - #phy-cells: must be 0. + + - clocks: a clock-specifier pair for the "pipe" clock + + - vdda-vp-supply: phandle to low voltage regulator + - vdda-vph-supply: phandle to high voltage regulator + + - resets: reset-specifier pairs for the "phy" and "pipe" resets + - reset-names: list of resets, should contain: + "phy" and "pipe" + + - clock-output-names: name of the outgoing clock signal from the PHY PLL + - #clock-cells: must be 0 + +Example: + phy@7786000 { + compatible = "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy"; + reg = <0x07786000 0xb8>; + + clocks = <&gcc GCC_PCIE_0_PIPE_CLK>; + resets = <&gcc GCC_PCIEPHY_0_PHY_BCR>, + <&gcc GCC_PCIE_0_PIPE_ARES>; + reset-names = "phy", "pipe"; + + vdda-vp-supply = <&vreg_l3_1p05>; + vdda-vph-supply = <&vreg_l5_1p8>; + + clock-output-names = "pcie_0_pipe_clk"; + #clock-cells = <0>; + #phy-cells = <0>; + }; -- cgit From 6c01edd395a7cc7bb82333e953992eb0e76b1c35 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 31 May 2019 10:02:11 -0600 Subject: docs: look for sphinx-pre-install in the source tree Recent makefile changes included an invocation of ./scripts/sphinx-pre-install. Unfortunately, that fails when a separate build directory is in use with: /bin/bash: ./scripts/sphinx-pre-install: No such file or directory Use $(srctree) to fully specify the location of this script. Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/Makefile b/Documentation/Makefile index 85d3cfafd77c..2edd03b1dad6 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -23,7 +23,7 @@ ifeq ($(HAVE_SPHINX),0) .DEFAULT: $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) @echo - @./scripts/sphinx-pre-install + @$(srctree)/scripts/sphinx-pre-install @echo " SKIP Sphinx $@ target." else # HAVE_SPHINX -- cgit From a5e112e6424adb77d953eac20e6936b952fd6b32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 May 2019 12:37:17 -0700 Subject: cgroup: add cgroup_parse_float() cgroup already uses floating point for percent[ile] numbers and there are several controllers which want to take them as input. Add a generic parse helper to handle inputs. Update the interface convention documentation about the use of percentage numbers. While at it, also clarify the default time unit. Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 88e746074252..73b0c0d8df31 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -696,6 +696,12 @@ Conventions informational files on the root cgroup which end up showing global information available elsewhere shouldn't exist. +- The default time unit is microseconds. If a different unit is ever + used, an explicit unit suffix must be present. + +- A parts-per quantity should use a percentage decimal with at least + two digit fractional part - e.g. 13.40. + - If a controller implements weight based resource distribution, its interface file should be named "weight" and have the range [1, 10000] with 100 as the default. The values are chosen to allow -- cgit From 0ff9848067b7b950a4ed70de7f5028600a2157e3 Mon Sep 17 00:00:00 2001 From: Ke Wu Date: Thu, 30 May 2019 12:22:08 -0700 Subject: security/loadpin: Allow to exclude specific file types Linux kernel already provide MODULE_SIG and KEXEC_VERIFY_SIG to make sure loaded kernel module and kernel image are trusted. This patch adds a kernel command line option "loadpin.exclude" which allows to exclude specific file types from LoadPin. This is useful when people want to use different mechanisms to verify module and kernel image while still use LoadPin to protect the integrity of other files kernel loads. Signed-off-by: Ke Wu Reviewed-by: James Morris [kees: fix array size issue reported by Coverity via Colin Ian King] Signed-off-by: Kees Cook --- Documentation/admin-guide/LSM/LoadPin.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/LSM/LoadPin.rst b/Documentation/admin-guide/LSM/LoadPin.rst index 32070762d24c..716ad9b23c9a 100644 --- a/Documentation/admin-guide/LSM/LoadPin.rst +++ b/Documentation/admin-guide/LSM/LoadPin.rst @@ -19,3 +19,13 @@ block device backing the filesystem is not read-only, a sysctl is created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having a mutable filesystem means pinning is mutable too, but having the sysctl allows for easy testing on systems with a mutable filesystem.) + +It's also possible to exclude specific file types from LoadPin using kernel +command line option "``loadpin.exclude``". By default, all files are +included, but they can be excluded using kernel command line option such +as "``loadpin.exclude=kernel-module,kexec-image``". This allows to use +different mechanisms such as ``CONFIG_MODULE_SIG`` and +``CONFIG_KEXEC_VERIFY_SIG`` to verify kernel module and kernel image while +still use LoadPin to protect the integrity of other files kernel loads. The +full list of valid file types can be found in ``kernel_read_file_str`` +defined in ``include/linux/fs.h``. -- cgit From c231c22a989af95fae3e75cac9d4511e0fe79377 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 May 2019 21:23:18 +0100 Subject: bpf: doc: update answer for 32-bit subregister question There has been quite a few progress around the two steps mentioned in the answer to the following question: Q: BPF 32-bit subregister requirements This patch updates the answer to reflect what has been done. v2: - Add missing full stop. (Song Liu) - Minor tweak on one sentence. (Song Liu) v1: - Integrated rephrase from Quentin and Jakub Reviewed-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_design_QA.rst | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index cb402c59eca5..12a246fcf6cb 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -172,11 +172,31 @@ registers which makes BPF inefficient virtual machine for 32-bit CPU architectures and 32-bit HW accelerators. Can true 32-bit registers be added to BPF in the future? -A: NO. The first thing to improve performance on 32-bit archs is to teach -LLVM to generate code that uses 32-bit subregisters. Then second step -is to teach verifier to mark operations where zero-ing upper bits -is unnecessary. Then JITs can take advantage of those markings and -drastically reduce size of generated code and improve performance. +A: NO. + +But some optimizations on zero-ing the upper 32 bits for BPF registers are +available, and can be leveraged to improve the performance of JITed BPF +programs for 32-bit architectures. + +Starting with version 7, LLVM is able to generate instructions that operate +on 32-bit subregisters, provided the option -mattr=+alu32 is passed for +compiling a program. Furthermore, the verifier can now mark the +instructions for which zero-ing the upper bits of the destination register +is required, and insert an explicit zero-extension (zext) instruction +(a mov32 variant). This means that for architectures without zext hardware +support, the JIT back-ends do not need to clear the upper bits for +subregisters written by alu32 instructions or narrow loads. Instead, the +back-ends simply need to support code generation for that mov32 variant, +and to overwrite bpf_jit_needs_zext() to make it return "true" (in order to +enable zext insertion in the verifier). + +Note that it is possible for a JIT back-end to have partial hardware +support for zext. In that case, if verifier zext insertion is enabled, +it could lead to the insertion of unnecessary zext instructions. Such +instructions could be removed by creating a simple peephole inside the JIT +back-end: if one instruction has hardware support for zext and if the next +instruction is an explicit zext, then the latter can be skipped when doing +the code generation. Q: Does BPF have a stable ABI? ------------------------------ -- cgit From 2585a584f8444e10a5a9e234c0c38b81dfe4370a Mon Sep 17 00:00:00 2001 From: Krishna Yarlagadda Date: Thu, 16 May 2019 17:23:11 +0530 Subject: pinctrl: Add Tegra194 pinctrl DT bindings Add binding doc for Tegra 194 pinctrl driver. Signed-off-by: Krishna Yarlagadda Tested-by: Vidya Sagar Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- .../bindings/pinctrl/nvidia,tegra194-pinmux.txt | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt new file mode 100644 index 000000000000..8763f448c376 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt @@ -0,0 +1,107 @@ +NVIDIA Tegra194 pinmux controller + +Required properties: +- compatible: "nvidia,tegra194-pinmux" +- reg: Should contain a list of base address and size pairs for: + - first entry: The APB_MISC_GP_*_PADCTRL registers (pad control) + - second entry: The PINMUX_AUX_* registers (pinmux) + +Please refer to pinctrl-bindings.txt in this directory for details of the +common pinctrl bindings used by client devices, including the meaning of the +phrase "pin configuration node". + +Tegra's pin configuration nodes act as a container for an arbitrary number of +subnodes. Each of these subnodes represents some desired configuration for a +pin, a group, or a list of pins or groups. This configuration can include the +mux function to select on those pin(s)/group(s), and various pin configuration +parameters, such as pull-up, tristate, drive strength, etc. + +See the TRM to determine which properties and values apply to each pin/group. +Macro values for property values are defined in +include/dt-binding/pinctrl/pinctrl-tegra.h. + +Required subnode-properties: +- nvidia,pins : An array of strings. Each string contains the name of a pin or + group. Valid values for these names are listed below. + +Optional subnode-properties: +- nvidia,function: A string containing the name of the function to mux to the + pin or group. +- nvidia,pull: Integer, representing the pull-down/up to apply to the pin. + 0: none, 1: down, 2: up. +- nvidia,tristate: Integer. + 0: drive, 1: tristate. +- nvidia,enable-input: Integer. Enable the pin's input path. + enable :TEGRA_PIN_ENABLE and + disable or output only: TEGRA_PIN_DISABLE. +- nvidia,open-drain: Integer. + enable: TEGRA_PIN_ENABLE. + disable: TEGRA_PIN_DISABLE. +- nvidia,lock: Integer. Lock the pin configuration against further changes + until reset. + enable: TEGRA_PIN_ENABLE. + disable: TEGRA_PIN_DISABLE. +- nvidia,io-hv: Integer. Select high-voltage receivers. + normal: TEGRA_PIN_DISABLE + high: TEGRA_PIN_ENABLE +- nvidia,schmitt: Integer. Enables Schmitt Trigger on the input. + normal: TEGRA_PIN_DISABLE + high: TEGRA_PIN_ENABLE +- nvidia,drive-type: Integer. Valid range 0...3. +- nvidia,pull-down-strength: Integer. Controls drive strength. 0 is weakest. + The range of valid values depends on the pingroup. See "CAL_DRVDN" in the + Tegra TRM. +- nvidia,pull-up-strength: Integer. Controls drive strength. 0 is weakest. + The range of valid values depends on the pingroup. See "CAL_DRVUP" in the + Tegra TRM. + +Valid values for pin and group names (nvidia,pin) are: + + These correspond to Tegra PADCTL_* (pinmux) registers. + + Mux groups: + + These correspond to Tegra PADCTL_* (pinmux) registers. Any property + that exists in those registers may be set for the following pin names. + + pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1 + + Drive groups: + + These registers controls a single pin for which a mux group exists. + See the list above for the pin name to use when configuring the pinmux. + + pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1 + +Valid values for nvidia,functions are: + + pe5 + +Power Domain: + pex_l5_clkreq_n_pgg0 and pex_l5_rst_n_pgg1 are part of PCIE C5 power + partition. Client devices must enable this partition before accessing + these pins here. + + +Example: + + tegra_pinctrl: pinmux: pinmux@2430000 { + compatible = "nvidia,tegra194-pinmux"; + reg = <0x2430000 0x17000 + 0xc300000 0x4000>; + + pinctrl-names = "pex_rst"; + pinctrl-0 = <&pex_rst_c5_out_state>; + + pex_rst_c5_out_state: pex_rst_c5_out { + pex_rst { + nvidia,pins = "pex_l5_rst_n_pgg1"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + }; -- cgit From c01fbbc83f42748b3ed094497933601e6c9e0a03 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:18 +0800 Subject: locking/lockdep: Add description and explanation in lockdep design doc More words are added to lockdep design document regarding key concepts, which should help people without lockdep experience read and understand lockdep reports. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-3-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- Documentation/locking/lockdep-design.txt | 79 ++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 39fae143c9cb..ae65758383ea 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -15,34 +15,48 @@ tens of thousands of) instantiations. For example a lock in the inode struct is one class, while each inode has its own instantiation of that lock class. -The validator tracks the 'state' of lock-classes, and it tracks -dependencies between different lock-classes. The validator maintains a -rolling proof that the state and the dependencies are correct. - -Unlike an lock instantiation, the lock-class itself never goes away: when -a lock-class is used for the first time after bootup it gets registered, -and all subsequent uses of that lock-class will be attached to this -lock-class. +The validator tracks the 'usage state' of lock-classes, and it tracks +the dependencies between different lock-classes. Lock usage indicates +how a lock is used with regard to its IRQ contexts, while lock +dependency can be understood as lock order, where L1 -> L2 suggests that +a task is attempting to acquire L2 while holding L1. From lockdep's +perspective, the two locks (L1 and L2) are not necessarily related; that +dependency just means the order ever happened. The validator maintains a +continuing effort to prove lock usages and dependencies are correct or +the validator will shoot a splat if incorrect. + +A lock-class's behavior is constructed by its instances collectively: +when the first instance of a lock-class is used after bootup the class +gets registered, then all (subsequent) instances will be mapped to the +class and hence their usages and dependecies will contribute to those of +the class. A lock-class does not go away when a lock instance does, but +it can be removed if the memory space of the lock class (static or +dynamic) is reclaimed, this happens for example when a module is +unloaded or a workqueue is destroyed. State ----- -The validator tracks lock-class usage history into 4 * nSTATEs + 1 separate -state bits: +The validator tracks lock-class usage history and divides the usage into +(4 usages * n STATEs + 1) categories: +where the 4 usages can be: - 'ever held in STATE context' - 'ever held as readlock in STATE context' - 'ever held with STATE enabled' - 'ever held as readlock with STATE enabled' -Where STATE can be either one of (kernel/locking/lockdep_states.h) - - hardirq - - softirq +where the n STATEs are coded in kernel/locking/lockdep_states.h and as of +now they include: +- hardirq +- softirq +where the last 1 category is: - 'ever used' [ == !unused ] -When locking rules are violated, these state bits are presented in the -locking error messages, inside curlies. A contrived example: +When locking rules are violated, these usage bits are presented in the +locking error messages, inside curlies, with a total of 2 * n STATEs bits. +A contrived example: modprobe/2287 is trying to acquire lock: (&sio_locks[i].lock){-.-.}, at: [] mutex_lock+0x21/0x24 @@ -51,15 +65,44 @@ locking error messages, inside curlies. A contrived example: (&sio_locks[i].lock){-.-.}, at: [] mutex_lock+0x21/0x24 -The bit position indicates STATE, STATE-read, for each of the states listed -above, and the character displayed in each indicates: +For a given lock, the bit positions from left to right indicate the usage +of the lock and readlock (if exists), for each of the n STATEs listed +above respectively, and the character displayed at each bit position +indicates: '.' acquired while irqs disabled and not in irq context '-' acquired in irq context '+' acquired with irqs enabled '?' acquired in irq context with irqs enabled. -Unused mutexes cannot be part of the cause of an error. +The bits are illustrated with an example: + + (&sio_locks[i].lock){-.-.}, at: [] mutex_lock+0x21/0x24 + |||| + ||| \-> softirq disabled and not in softirq context + || \--> acquired in softirq context + | \---> hardirq disabled and not in hardirq context + \----> acquired in hardirq context + + +For a given STATE, whether the lock is ever acquired in that STATE +context and whether that STATE is enabled yields four possible cases as +shown in the table below. The bit character is able to indicate which +exact case is for the lock as of the reporting time. + + ------------------------------------------- + | | irq enabled | irq disabled | + |-------------------------------------------| + | ever in irq | ? | - | + |-------------------------------------------| + | never in irq | + | . | + ------------------------------------------- + +The character '-' suggests irq is disabled because if otherwise the +charactor '?' would have been shown instead. Similar deduction can be +applied for '+' too. + +Unused locks (e.g., mutexes) cannot be part of the cause of an error. Single-lock state rules: -- cgit From 1ac4ba5ed0114bcc146d5743d97df414af25c524 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:32 +0800 Subject: locking/lockdep: Add explanation to lock usage rules in lockdep design doc The irq usage and lock dependency rules that if violated a deacklock may happen are explained in more detail. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-17-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- Documentation/locking/lockdep-design.txt | 33 ++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index ae65758383ea..f189d130e543 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -108,14 +108,24 @@ Unused locks (e.g., mutexes) cannot be part of the cause of an error. Single-lock state rules: ------------------------ +A lock is irq-safe means it was ever used in an irq context, while a lock +is irq-unsafe means it was ever acquired with irq enabled. + A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The -following states are exclusive, and only one of them is allowed to be -set for any lock-class: +following states must be exclusive: only one of them is allowed to be set +for any lock-class based on its usage: + + or + or - and - and +This is because if a lock can be used in irq context (irq-safe) then it +cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a +deadlock may happen. For example, in the scenario that after this lock +was acquired but before released, if the context is interrupted this +lock will be attempted to acquire twice, which creates a deadlock, +referred to as lock recursion deadlock. -The validator detects and reports lock usage that violate these +The validator detects and reports lock usage that violates these single-lock state rules. Multi-lock dependency rules: @@ -124,15 +134,18 @@ Multi-lock dependency rules: The same lock-class must not be acquired twice, because this could lead to lock recursion deadlocks. -Furthermore, two locks may not be taken in different order: +Furthermore, two locks can not be taken in inverse order: -> -> -because this could lead to lock inversion deadlocks. (The validator -finds such dependencies in arbitrary complexity, i.e. there can be any -other locking sequence between the acquire-lock operations, the -validator will still track all dependencies between locks.) +because this could lead to a deadlock - referred to as lock inversion +deadlock - as attempts to acquire the two locks form a circle which +could lead to the two contexts waiting for each other permanently. The +validator will find such dependency circle in arbitrary complexity, +i.e., there can be any other locking sequence between the acquire-lock +operations; the validator will still find whether these locks can be +acquired in a circular fashion. Furthermore, the following usage based lock dependencies are not allowed between any two lock-classes: -- cgit From fff9b6c7d26943a8eb32b58364b7ec6b9369746a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 May 2019 13:52:31 +0200 Subject: Documentation/atomic_t.txt: Clarify pure non-rmw usage Clarify that pure non-RMW usage of atomic_t is pointless, there is nothing 'magical' about atomic_set() / atomic_read(). This is something that seems to confuse people, because I happen upon it semi-regularly. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190524115231.GN2623@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/atomic_t.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index dca3fb0554db..89eae7f6b360 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -81,9 +81,11 @@ Non-RMW ops: The non-RMW ops are (typically) regular LOADs and STOREs and are canonically implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and -smp_store_release() respectively. +smp_store_release() respectively. Therefore, if you find yourself only using +the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all +and are doing it wrong. -The one detail to this is that atomic_set{}() should be observable to the RMW +A subtle detail of atomic_set{}() is that it should be observable to the RMW ops. That is: C atomic-set -- cgit From c9c2c27d7ceca8c2856c5008f2002bddb384f518 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Apr 2019 15:46:55 +0200 Subject: debugfs: make debugfs_create_u32_array() return void The single user of debugfs_create_u32_array() does not care about the return value of it, so make it return void as there is no need to do anything with the return value. Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/debugfs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt index 4a0a9c3f4af6..9e27c843d00e 100644 --- a/Documentation/filesystems/debugfs.txt +++ b/Documentation/filesystems/debugfs.txt @@ -169,7 +169,7 @@ byte offsets over a base for the register block. If you want to dump an u32 array in debugfs, you can create file with: - struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, u32 *array, u32 elements); -- cgit From 0864c9ce8fe83eadfd21b08e98997111d091660c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 3 Jun 2019 07:32:50 -0400 Subject: media: dt-bindings: Fix vendor-prefixes YAML Commit 8df39e16877f ("media: dt-bindings: media: Add vendor prefix for allegro") introduced a new devicetree binding vendors, however with an improper syntax making the resulting YAML impossible to parse. Let's fix this. Fixes: 8df39e16877f ("media: dt-bindings: media: Add vendor prefix for allegro") Signed-off-by: Maxime Ripard Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 7da11464991a..1acf806b62bf 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -49,7 +49,7 @@ patternProperties: description: Aeroflex Gaisler AB "^al,.*": description: Annapurna Labs - "^allegro,.*" + "^allegro,.*": description: Allegro DVT "^allo,.*": description: Allo.com -- cgit From 4d3aed70902f299ff2ed7048ef44f0d4d573d786 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 29 May 2019 17:49:06 -0700 Subject: f2fs: Add option to limit required GC for checkpoint=disable This extends the checkpoint option to allow checkpoint=disable:%u[%] This allows you to specify what how much of the disk you are willing to lose access to while mounting with checkpoint=disable. If the amount lost would be higher, the mount will return -EAGAIN. This can be given as a percent of total space, or in blocks. Currently, we need to run garbage collection until the amount of holes is smaller than the OVP space. With the new option, f2fs can mark space as unusable up front instead of requiring garbage collection until the number of holes is small enough. Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ Documentation/filesystems/f2fs.txt | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 91822ce25831..dca326e0ee3e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -243,3 +243,11 @@ Description: - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list - [h] means add/del hot file extension - [c] means add/del cold file extension + +What: /sys/fs/f2fs//unusable +Date April 2019 +Contact: "Daniel Rosenberg" +Description: + If checkpoint=disable, it displays the number of blocks that are unusable. + If checkpoint=enable it displays the enumber of blocks that would be unusable + if checkpoint=disable were to be set. diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 66aca042988e..bebd1be3ba49 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -214,11 +214,22 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix", non-atomic files likewise "nobarrier" mount option. test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. -checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable" +checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" to reenable checkpointing. Is enabled by default. While disabled, any unmounting or unexpected shutdowns will cause the filesystem contents to appear as they did when the filesystem was mounted with that option. + While mounting with checkpoint=disabled, the filesystem must + run garbage collection to ensure that all available space can + be used. If this takes too much time, the mount may return + EAGAIN. You may optionally add a value to indicate how much + of the disk you would be willing to temporarily give up to + avoid additional garbage collection. This can be given as a + number of blocks, or as a percent. For instance, mounting + with checkpoint=disable:100% would always succeed, but it may + hide up to all remaining free space. The actual space that + would be unusable can be viewed at /sys/fs/f2fs//unusable + This space is reclaimed once checkpoint=enable. ================================================================================ DEBUGFS ENTRIES @@ -396,6 +407,12 @@ Files in /sys/fs/f2fs/ current_reserved_blocks This shows # of blocks currently reserved. + unusable If checkpoint=disable, this shows the number of + blocks that are unusable. + If checkpoint=enable it shows the number of blocks + that would be unusable if checkpoint=disable were + to be set. + ================================================================================ USAGE ================================================================================ -- cgit From adf671ccd2aa067cd9786d6337dfe4a3d2cc2e8f Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Mon, 3 Jun 2019 17:43:24 +0300 Subject: net: ena: documentation: update ena.txt Small cosmetic changes to ena.txt Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- Documentation/networking/device_drivers/amazon/ena.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/amazon/ena.txt b/Documentation/networking/device_drivers/amazon/ena.txt index 2b4b6f57e549..1bb55c7b604c 100644 --- a/Documentation/networking/device_drivers/amazon/ena.txt +++ b/Documentation/networking/device_drivers/amazon/ena.txt @@ -73,7 +73,7 @@ operation. AQ is used for submitting management commands, and the results/responses are reported asynchronously through ACQ. -ENA introduces a very small set of management commands with room for +ENA introduces a small set of management commands with room for vendor-specific extensions. Most of the management operations are framed in a generic Get/Set feature command. @@ -202,11 +202,14 @@ delay value to each level. The user can enable/disable adaptive moderation, modify the interrupt delay table and restore its default values through sysfs. +RX copybreak: +============= The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK and can be configured by the ETHTOOL_STUNABLE command of the SIOCETHTOOL ioctl. SKB: +==== The driver-allocated SKB for frames received from Rx handling using NAPI context. The allocation method depends on the size of the packet. If the frame length is larger than rx_copybreak, napi_get_frags() -- cgit From fa0e5158c26ea63e1ac45befeda4d03e02f00d44 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 1 Jun 2019 08:03:10 +0800 Subject: dt-bindings: clock: mediatek: Add an extra required property to sgmiisys add an extra required property "mediatek,physpeed" to sgmiisys to determine link speed to match up the capability of the target PHY. Signed-off-by: Sean Wang Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt index 30cb645c0e54..f5518f26a914 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt @@ -9,6 +9,8 @@ Required Properties: - "mediatek,mt7622-sgmiisys", "syscon" - "mediatek,mt7629-sgmiisys", "syscon" - #clock-cells: Must be 1 +- mediatek,physpeed: Should be one of "auto", "1000" or "2500" to match up + the capability of the target PHY. The SGMIISYS controller uses the common clk binding from Documentation/devicetree/bindings/clock/clock-bindings.txt -- cgit From 3277fc683ae57f15b646b500c216e380821231c8 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 1 Jun 2019 08:03:11 +0800 Subject: dt-bindings: net: mediatek: Add support for MediaTek MT7629 SoC Add binding document for the ethernet on MT7629 SoC. Signed-off-by: Sean Wang Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/mediatek-net.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt index 503f2b9194e2..770ff98d4524 100644 --- a/Documentation/devicetree/bindings/net/mediatek-net.txt +++ b/Documentation/devicetree/bindings/net/mediatek-net.txt @@ -11,6 +11,7 @@ Required properties: "mediatek,mt2701-eth": for MT2701 SoC "mediatek,mt7623-eth", "mediatek,mt2701-eth": for MT7623 SoC "mediatek,mt7622-eth": for MT7622 SoC + "mediatek,mt7629-eth": for MT7629 SoC - reg: Address and length of the register set for the device - interrupts: Should contain the three frame engines interrupts in numeric order. These are fe_int0, fe_int1 and fe_int2. @@ -19,14 +20,23 @@ Required properties: "ethif", "esw", "gp2", "gp1" : For MT2701 and MT7623 SoC "ethif", "esw", "gp0", "gp1", "gp2", "sgmii_tx250m", "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii_ck", "eth2pll" : For MT7622 SoC + "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "sgmii_tx250m", + "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii2_tx250m", + "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", "sgmii_ck", + "eth2pll" : For MT7629 SoC. - power-domains: phandle to the power domain that the ethernet is part of - resets: Should contain phandles to the ethsys reset signals - reset-names: Should contain the names of reset signal listed in the resets property These are "fe", "gmac" and "ppe" - mediatek,ethsys: phandle to the syscon node that handles the port setup -- mediatek,sgmiisys: phandle to the syscon node that handles the SGMII setup - which is required for those SoCs equipped with SGMII such as MT7622 SoC. +- mediatek,infracfg: phandle to the syscon node that handles the path from + GMAC to PHY variants, which is required for MT7629 SoC. +- mediatek,sgmiisys: a list of phandles to the syscon node that handles the + SGMII setup which is required for those SoCs equipped with SGMII such + as MT7622 and MT7629 SoC. And MT7622 have only one set of SGMII shared + by GMAC1 and GMAC2; MT7629 have two independent sets of SGMII directed + to GMAC1 and GMAC2, respectively. - mediatek,pctl: phandle to the syscon node that handles the ports slew rate and driver current: only for MT2701 and MT7623 SoC -- cgit From af9422a85721f7afa8d5ad3442b5de5549a23e84 Mon Sep 17 00:00:00 2001 From: Gareth Williams Date: Tue, 28 May 2019 12:54:26 +0100 Subject: dt-bindings: clock: renesas: r9a06g032-sysctrl: Document power Domains The driver is gaining power domain support, so add the new property to the DT binding and update the examples. Signed-off-by: Gareth Williams Signed-off-by: Geert Uytterhoeven --- .../devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt index d60b99756bb9..aed713cf0831 100644 --- a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt +++ b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt @@ -13,6 +13,7 @@ Required Properties: - external (optional) RGMII_REFCLK - clock-names: Must be: clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext"; + - #power-domain-cells: Must be 0 Examples -------- @@ -27,6 +28,7 @@ Examples clocks = <&ext_mclk>, <&ext_rtc_clk>, <&ext_jtag_clk>, <&ext_rgmii_ref>; clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext"; + #power-domain-cells = <0>; }; - Other nodes can use the clocks provided by SYSCTRL as in: @@ -38,6 +40,7 @@ Examples interrupts = ; reg-shift = <2>; reg-io-width = <4>; - clocks = <&sysctrl R9A06G032_CLK_UART0>; - clock-names = "baudclk"; + clocks = <&sysctrl R9A06G032_CLK_UART0>, <&sysctrl R9A06G032_HCLK_UART0>; + clock-names = "baudclk", "apb_pclk"; + power-domains = <&sysctrl>; }; -- cgit From f48d14c218ccaaf7c22f84582d4bdb5fa2234c76 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 27 May 2019 22:14:53 +0200 Subject: dt-bindings: arm64: allwinner: h6: Add binding for DMA controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DMA in H6 is similar to other DMA controller, except it is first which supports more than 32 request sources and has 16 channels. It also needs additional clock to be enabled. Signed-off-by: Jernej Skrabec Reviewed-by: Rob Herring Signed-off-by: Clément Péron Acked-by: Maxime Ripard Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/sun6i-dma.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt index 7fccc20d8331..cae31f4e77ba 100644 --- a/Documentation/devicetree/bindings/dma/sun6i-dma.txt +++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt @@ -28,12 +28,17 @@ Example: }; ------------------------------------------------------------------------------ -For A64 DMA controller: +For A64 and H6 DMA controller: Required properties: -- compatible: "allwinner,sun50i-a64-dma" +- compatible: Must be one of + "allwinner,sun50i-a64-dma" + "allwinner,sun50i-h6-dma" - dma-channels: Number of DMA channels supported by the controller. Refer to Documentation/devicetree/bindings/dma/dma.txt +- clocks: In addition to parent AHB clock, it should also contain mbus + clock (H6 only) +- clock-names: Should contain "bus" and "mbus" (H6 only) - all properties above, i.e. reg, interrupts, clocks, resets and #dma-cells Optional properties: -- cgit From 18e1572419d69f8d45248cccabc40352a3e281d6 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 4 Jun 2019 07:55:49 -0600 Subject: docs: Completely fix the remote build tree case My previous fix miserably failed to catch all of the invocations of "./scripts/sphinx-pre-install", so we got build errors. Try again with more caffeine. Reported-by: kbuild test robot Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/Makefile b/Documentation/Makefile index 2edd03b1dad6..2df0789f90b7 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -72,14 +72,14 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) $(abspath $(BUILDDIR)/$3/$4) htmldocs: - @./scripts/sphinx-pre-install --version-check + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) linkcheckdocs: @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) latexdocs: - @./scripts/sphinx-pre-install --version-check + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) ifeq ($(HAVE_PDFLATEX),0) @@ -91,17 +91,17 @@ pdfdocs: else # HAVE_PDFLATEX pdfdocs: latexdocs - @./scripts/sphinx-pre-install --version-check + @$(srctree)/scripts/sphinx-pre-install --version-check $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) endif # HAVE_PDFLATEX epubdocs: - @./scripts/sphinx-pre-install --version-check + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) xmldocs: - @./scripts/sphinx-pre-install --version-check + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) endif # HAVE_SPHINX -- cgit From d6e65bb7ff0d360c4c5462c3d0b237f2a07e5312 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Tue, 28 May 2019 12:30:17 +0800 Subject: ASoC: rt1011: Add RT1011 amplifier driver This is the initial amplifier driver for rt1011. Signed-off-by: Shuming Fan Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt1011.txt | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/rt1011.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/rt1011.txt b/Documentation/devicetree/bindings/sound/rt1011.txt new file mode 100644 index 000000000000..35a23e60d679 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/rt1011.txt @@ -0,0 +1,32 @@ +RT1011 Mono Class D Audio Amplifier + +This device supports I2C only. + +Required properties: + +- compatible : "realtek,rt1011". + +- reg : The I2C address of the device. This I2C address decide by + two input pins (ASEL1 and ASEL2). + ------------------------------------- + | ASEL2 | ASEL1 | Address | + ------------------------------------- + | 0 | 0 | 0x38 | + ------------------------------------- + | 0 | 1 | 0x39 | + ------------------------------------- + | 1 | 0 | 0x3a | + ------------------------------------- + | 1 | 1 | 0x3b | + ------------------------------------- + +Pins on the device (for linking into audio routes) for RT1011: + + * SPO + +Example: + +rt1011: codec@38 { + compatible = "realtek,rt1011"; + reg = <0x38>; +}; -- cgit From 6d72a49ff3f6bb0932718c132b75351aedc9458e Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 4 Jun 2019 14:12:56 +0900 Subject: spi: Add DT bindings for Synquacer This patch adds documentation for Device-Tree bindings for the Socionext Synquacer spi driver. Signed-off-by: Masahisa Kojima Signed-off-by: Jassi Brar Reviewed-by: Rob Herring Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/spi-synquacer.txt | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 Documentation/devicetree/bindings/spi/spi-synquacer.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/spi/spi-synquacer.txt b/Documentation/devicetree/bindings/spi/spi-synquacer.txt new file mode 100644 index 000000000000..291dfa692d0a --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-synquacer.txt @@ -0,0 +1,27 @@ +* Socionext Synquacer HS-SPI bindings + +Required Properties: +- compatible: should be "socionext,synquacer-spi" +- reg: physical base address of the controller and length of memory mapped + region. +- interrupts: should contain the "spi_rx", "spi_tx" and "spi_fault" interrupts. +- clocks: core clock iHCLK. Optional rate clock iPCLK (default is iHCLK) +- clock-names: Shall be "iHCLK" and "iPCLK" respectively + +Optional Properties: +- socionext,use-rtm: boolean, if required to use "retimed clock" for RX +- socionext,set-aces: boolean, if same active clock edges field to be set. + +Example: + + spi0: spi@ff110000 { + compatible = "socionext,synquacer-spi"; + reg = <0xff110000 0x1000>; + interrupts = , + , + ; + clocks = <&clk_hsspi>; + clock-names = "iHCLK"; + socionext,use-rtm; + socionext,set-aces; + }; -- cgit From be1038846b8063c448baf5ddcdc2387241c4133e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 4 Jun 2019 11:17:47 -0300 Subject: docs: soundwire: locking: fix tags for a code-block There's an ascii artwork at Example 1 whose code-block is not properly idented, causing those warnings. Documentation/driver-api/soundwire/locking.rst:50: WARNING: Inconsistent literal block quoting. Documentation/driver-api/soundwire/locking.rst:51: WARNING: Line block ends without a blank line. Documentation/driver-api/soundwire/locking.rst:55: WARNING: Inline substitution_reference start-string without end-string. Documentation/driver-api/soundwire/locking.rst:56: WARNING: Line block ends without a blank line. Signed-off-by: Mauro Carvalho Chehab Acked-by: Pierre-Louis Bossart Signed-off-by: Vinod Koul --- Documentation/driver-api/soundwire/locking.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/soundwire/locking.rst b/Documentation/driver-api/soundwire/locking.rst index 253f73555255..3a7ffb3d87f3 100644 --- a/Documentation/driver-api/soundwire/locking.rst +++ b/Documentation/driver-api/soundwire/locking.rst @@ -44,7 +44,9 @@ Message transfer. b. Transfer message (Read/Write) to Slave1 or broadcast message on Bus in case of bank switch. - c. Release Message lock :: + c. Release Message lock + + :: +----------+ +---------+ | | | | -- cgit From 8ffdaa7f491961efb4d02d3c8d806e9a60105adb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 May 2019 14:06:52 +0800 Subject: KVM: Documentation: Add disable pause exits to KVM_CAP_X86_DISABLE_EXITS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b31c114b (KVM: X86: Provide a capability to disable PAUSE intercepts) forgot to add the KVM_X86_DISABLE_EXITS_PAUSE into api doc. This patch adds it. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ba6c42c576dd..33cd92dd6aa5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4893,6 +4893,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) +#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some -- cgit From b51700632e0e53254733ff706e5bdca22d19dbe5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 May 2019 14:06:53 +0800 Subject: KVM: X86: Provide a capability to disable cstate msr read intercepts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow guest reads CORE cstate when exposing host CPU power management capabilities to the guest. PKG cstate is restricted to avoid a guest to get the whole package information in multi-tenant scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 33cd92dd6aa5..91fd86fcc49f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4894,6 +4894,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some -- cgit From 87b11e0638c3dbf029a7c9020f8a779062db58fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Jun 2019 15:17:00 -0700 Subject: net/tls: remove false positive warning It's possible that TCP stack will decide to retransmit a packet right when that packet's data gets acked, especially in presence of packet reordering. This means that packets may be in flight, even though tls_device code has already freed their record state. Make fill_sg_in() and in turn tls_sw_fallback() not generate a warning in that case, and quietly proceed to drop such frames. Make the exit path from tls_sw_fallback() drop monitor friendly, for users to be able to troubleshoot dropped retransmissions. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index cb85af559dff..eb7c9b81ccf5 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -379,7 +379,6 @@ by the driver: but did not arrive in the expected order * ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because they arrived out of order and associated record could not be found - (see also :ref:`pre_tls_data`) Notable corner cases, exceptions and additional requirements ============================================================ @@ -462,21 +461,3 @@ Redirects leak clear text In the RX direction, if segment has already been decrypted by the device and it gets redirected or mirrored - clear text will be transmitted out. - -.. _pre_tls_data: - -Transmission of pre-TLS data ----------------------------- - -User can enqueue some already encrypted and framed records before enabling -``ktls`` on the socket. Those records have to get sent as they are. This is -perfectly easy to handle in the software case - such data will be waiting -in the TCP layer, TLS ULP won't see it. In the offloaded case when pre-queued -segment reaches transmission point it appears to be out of order (before the -expected TCP sequence number) and the stack does not have a record information -associated. - -All segments without record information cannot, however, be assumed to be -pre-queued data, because a race condition exists between TCP stack queuing -a retransmission, the driver seeing the retransmission and TCP ACK arriving -for the retransmitted data. -- cgit From dabde0dac12481c055d2d0ef6686c5789a1a3499 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 4 Jun 2019 07:34:33 +0000 Subject: dt-bindings: net: dsa: marvell: add "marvell,mv88e6250" compatible string The mv88e6250 has port_base_addr 0x8 or 0x18 (depending on configuration pins), so it constitutes a new family and hence needs its own compatible string. Reviewed-by: Andrew Lunn Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/marvell.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt index feb007af13cb..6f9538974bb9 100644 --- a/Documentation/devicetree/bindings/net/dsa/marvell.txt +++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt @@ -21,10 +21,13 @@ which is at a different MDIO base address in different switch families. 6341, 6350, 6351, 6352 - "marvell,mv88e6190" : Switch has base address 0x00. Use with models: 6190, 6190X, 6191, 6290, 6390, 6390X +- "marvell,mv88e6250" : Switch has base address 0x08 or 0x18. Use with model: + 6250 Required properties: -- compatible : Should be one of "marvell,mv88e6085" or - "marvell,mv88e6190" as indicated above +- compatible : Should be one of "marvell,mv88e6085", + "marvell,mv88e6190" or "marvell,mv88e6250" as + indicated above - reg : Address on the MII bus for the switch. Optional properties: -- cgit From bcc8737ddcaad43acbb9ccf768f1be9c61bd493f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 4 Jun 2019 11:17:42 -0300 Subject: Documentation/i915: Fix references to renamed files WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -function Hardware workarounds ./drivers/gpu/drm/i915/intel_workarounds.c' failed with return code 1 WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -function Logical Rings, Logical Ring Contexts and Execlists ./drivers/gpu/drm/i915/intel_lrc.c' failed with return code 1 WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -internal ./drivers/gpu/drm/i915/intel_lrc.c' failed with return code 2 Fixes: 112ed2d31a46 ("drm/i915: Move GraphicsTechnology files under gt/") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/bd7dd29b9fb2101c954c8cfb2c3b4efc7d277045.1559656538.git.mchehab+samsung@kernel.org --- Documentation/gpu/i915.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst index 055df45596c1..6c75380b2928 100644 --- a/Documentation/gpu/i915.rst +++ b/Documentation/gpu/i915.rst @@ -61,7 +61,7 @@ Intel GVT-g Host Support(vGPU device model) Workarounds ----------- -.. kernel-doc:: drivers/gpu/drm/i915/intel_workarounds.c +.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_workarounds.c :doc: Hardware workarounds Display Hardware Handling @@ -379,10 +379,10 @@ User Batchbuffer Execution Logical Rings, Logical Ring Contexts and Execlists -------------------------------------------------- -.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c +.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c :doc: Logical Rings, Logical Ring Contexts and Execlists -.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c +.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c :internal: Global GTT views -- cgit From 355fb0e54e85316fedd2f688fc62917cc4a5bb81 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Tue, 4 Jun 2019 19:21:53 +0200 Subject: dt-bindings: input: sun4i-lradc-keys: Add A64 compatible Add the A64 compatible with a fallback to the A83T compatible. Signed-off-by: Luca Weiss Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt index 496125c6bfb7..507b737612ea 100644 --- a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt +++ b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt @@ -5,6 +5,7 @@ Required properties: - compatible: should be one of the following string: "allwinner,sun4i-a10-lradc-keys" "allwinner,sun8i-a83t-r-lradc" + "allwinner,sun50i-a64-lradc", "allwinner,sun8i-a83t-r-lradc" - reg: mmio address range of the chip - interrupts: interrupt to which the chip is connected - vref-supply: powersupply for the lradc reference voltage -- cgit From 0d9ce162cf46c99628cc5da9510b959c7976735b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 3 Jan 2019 17:14:28 -0800 Subject: kvm: Convert kvm_lock to a mutex It doesn't seem as if there is any particular need for kvm_lock to be a spinlock, so convert the lock to a mutex so that sleepable functions (in particular cond_resched()) can be called while holding it. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 1bb8bcaf8497..635cd6eaf714 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows: On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. -For spinlocks, kvm_lock is taken outside kvm->mmu_lock. - Everything else is a leaf: no other lock is taken inside the critical sections. @@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above. ------------ Name: kvm_lock -Type: spinlock_t +Type: mutex Arch: any Protects: - vm_list -- cgit From b467ec063ec56900e1ebba4d5aeb50b0a7cb0ef8 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 27 May 2019 12:02:19 +0200 Subject: dt-bindings: clk: Convert Allwinner CCU to a schema The Allwinner SoCs have a clocks controller supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schemas. Signed-off-by: Maxime Ripard --- .../bindings/clock/allwinner,sun4i-a10-ccu.yaml | 141 +++++++++++++++++++++ .../devicetree/bindings/clock/sunxi-ccu.txt | 62 --------- 2 files changed, 141 insertions(+), 62 deletions(-) create mode 100644 Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml delete mode 100644 Documentation/devicetree/bindings/clock/sunxi-ccu.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml new file mode 100644 index 000000000000..c935405458fe --- /dev/null +++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/phy/allwinner,sun4i-a10-ccu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner Clock Control Unit Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#clock-cells": + const: 1 + + "#reset-cells": + const: 1 + + compatible: + enum: + - allwinner,sun4i-a10-ccu + - allwinner,sun5i-a10s-ccu + - allwinner,sun5i-a13-ccu + - allwinner,sun6i-a31-ccu + - allwinner,sun7i-a20-ccu + - allwinner,sun8i-a23-ccu + - allwinner,sun8i-a33-ccu + - allwinner,sun8i-a83t-ccu + - allwinner,sun8i-a83t-r-ccu + - allwinner,sun8i-h3-ccu + - allwinner,sun8i-h3-r-ccu + - allwinner,sun8i-r40-ccu + - allwinner,sun8i-v3s-ccu + - allwinner,sun9i-a80-ccu + - allwinner,sun50i-a64-ccu + - allwinner,sun50i-a64-r-ccu + - allwinner,sun50i-h5-ccu + - allwinner,sun50i-h6-ccu + - allwinner,sun50i-h6-r-ccu + - allwinner,suniv-f1c100s-ccu + - nextthing,gr8-ccu + + reg: + maxItems: 1 + + clocks: + minItems: 2 + maxItems: 4 + items: + - description: High Frequency Oscillator (usually at 24MHz) + - description: Low Frequency Oscillator (usually at 32kHz) + - description: Internal Oscillator + - description: Peripherals PLL + + clock-names: + minItems: 2 + maxItems: 4 + items: + - const: hosc + - const: losc + - const: iosc + - const: pll-periph + +required: + - "#clock-cells" + - "#reset-cells" + - compatible + - reg + - clocks + - clock-names + +if: + properties: + compatible: + enum: + - allwinner,sun8i-a83t-r-ccu + - allwinner,sun8i-h3-r-ccu + - allwinner,sun50i-a64-r-ccu + - allwinner,sun50i-h6-r-ccu + +then: + properties: + clocks: + minItems: 4 + maxItems: 4 + + clock-names: + minItems: 4 + maxItems: 4 + +else: + if: + properties: + compatible: + const: allwinner,sun50i-h6-ccu + + then: + properties: + clocks: + minItems: 3 + maxItems: 3 + + clock-names: + minItems: 3 + maxItems: 3 + + else: + properties: + clocks: + minItems: 2 + maxItems: 2 + + clock-names: + minItems: 2 + maxItems: 2 + +additionalProperties: false + +examples: + - | + ccu: clock@1c20000 { + compatible = "allwinner,sun8i-h3-ccu"; + reg = <0x01c20000 0x400>; + clocks = <&osc24M>, <&osc32k>; + clock-names = "hosc", "losc"; + #clock-cells = <1>; + #reset-cells = <1>; + }; + + - | + r_ccu: clock@1f01400 { + compatible = "allwinner,sun50i-a64-r-ccu"; + reg = <0x01f01400 0x100>; + clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu 11>; + clock-names = "hosc", "losc", "iosc", "pll-periph"; + #clock-cells = <1>; + #reset-cells = <1>; + }; + +... diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt deleted file mode 100644 index e3bd88ae456b..000000000000 --- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt +++ /dev/null @@ -1,62 +0,0 @@ -Allwinner Clock Control Unit Binding ------------------------------------- - -Required properties : -- compatible: must contain one of the following compatibles: - - "allwinner,sun4i-a10-ccu" - - "allwinner,sun5i-a10s-ccu" - - "allwinner,sun5i-a13-ccu" - - "allwinner,sun6i-a31-ccu" - - "allwinner,sun7i-a20-ccu" - - "allwinner,sun8i-a23-ccu" - - "allwinner,sun8i-a33-ccu" - - "allwinner,sun8i-a83t-ccu" - - "allwinner,sun8i-a83t-r-ccu" - - "allwinner,sun8i-h3-ccu" - - "allwinner,sun8i-h3-r-ccu" -+ - "allwinner,sun8i-r40-ccu" - - "allwinner,sun8i-v3s-ccu" - - "allwinner,sun9i-a80-ccu" - - "allwinner,sun50i-a64-ccu" - - "allwinner,sun50i-a64-r-ccu" - - "allwinner,sun50i-h5-ccu" - - "allwinner,sun50i-h6-ccu" - - "allwinner,sun50i-h6-r-ccu" - - "allwinner,suniv-f1c100s-ccu" - - "nextthing,gr8-ccu" - -- reg: Must contain the registers base address and length -- clocks: phandle to the oscillators feeding the CCU. Two are needed: - - "hosc": the high frequency oscillator (usually at 24MHz) - - "losc": the low frequency oscillator (usually at 32kHz) - On the A83T, this is the internal 16MHz oscillator divided by 512 -- clock-names: Must contain the clock names described just above -- #clock-cells : must contain 1 -- #reset-cells : must contain 1 - -For the main CCU on H6, one more clock is needed: -- "iosc": the SoC's internal frequency oscillator - -For the PRCM CCUs on A83T/H3/A64/H6, two more clocks are needed: -- "pll-periph": the SoC's peripheral PLL from the main CCU -- "iosc": the SoC's internal frequency oscillator - -Example for generic CCU: -ccu: clock@1c20000 { - compatible = "allwinner,sun8i-h3-ccu"; - reg = <0x01c20000 0x400>; - clocks = <&osc24M>, <&osc32k>; - clock-names = "hosc", "losc"; - #clock-cells = <1>; - #reset-cells = <1>; -}; - -Example for PRCM CCU: -r_ccu: clock@1f01400 { - compatible = "allwinner,sun50i-a64-r-ccu"; - reg = <0x01f01400 0x100>; - clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu CLK_PLL_PERIPH0>; - clock-names = "hosc", "losc", "iosc", "pll-periph"; - #clock-cells = <1>; - #reset-cells = <1>; -}; -- cgit From 79e3f1d3db3d99ac7ae4f29b00da545df231a5a7 Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Mon, 3 Jun 2019 12:16:49 -0600 Subject: platform/chrome: wilco_ec: Add version sysfs entries Add the ability to extract version information from the EC. Example Output: $ cd /sys/bus/platform/devices/GOOG000C:00 $ tail build_date build_revision version model_number ==> build_date <== 04/25/19 ==> build_revision <== d2592cae0 ==> version <== 00.00.14 ==> model_number <== 08B6 Signed-off-by: Raul E Rangel Reviewed-by: Nick Crews Signed-off-by: Enric Balletbo i Serra --- Documentation/ABI/testing/sysfs-platform-wilco-ec | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-platform-wilco-ec b/Documentation/ABI/testing/sysfs-platform-wilco-ec index 8e5d6eee44db..8827a734f933 100644 --- a/Documentation/ABI/testing/sysfs-platform-wilco-ec +++ b/Documentation/ABI/testing/sysfs-platform-wilco-ec @@ -7,3 +7,34 @@ Description: want to run their device headless or with a dock. Input should be parseable by kstrtou8() to 0 or 1. + +What: /sys/bus/platform/devices/GOOG000C\:00/build_date +Date: May 2019 +KernelVersion: 5.3 +Description: + Display Wilco Embedded Controller firmware build date. + Output will a MM/DD/YY string. + +What: /sys/bus/platform/devices/GOOG000C\:00/build_revision +Date: May 2019 +KernelVersion: 5.3 +Description: + Display Wilco Embedded Controller build revision. + Output will a version string be similar to the example below: + d2592cae0 + +What: /sys/bus/platform/devices/GOOG000C\:00/model_number +Date: May 2019 +KernelVersion: 5.3 +Description: + Display Wilco Embedded Controller model number. + Output will a version string be similar to the example below: + 08B6 + +What: /sys/bus/platform/devices/GOOG000C\:00/version +Date: May 2019 +KernelVersion: 5.3 +Description: + Display Wilco Embedded Controller firmware version. + The format of the string is x.y.z. Where x is major, y is minor + and z is the build number. For example: 95.00.06 -- cgit From e96a8819a6c4fc578809ba79d64abca57145acb7 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Wed, 5 Jun 2019 07:59:10 -0400 Subject: media: docs: fix minor typos Fix minor typos in the DVB demux documentation. Signed-off-by: Marc Gonzalez Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/kapi/dtv-core.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/media/kapi/dtv-core.rst b/Documentation/media/kapi/dtv-core.rst index ac005b46f23e..82c5b85ed9b1 100644 --- a/Documentation/media/kapi/dtv-core.rst +++ b/Documentation/media/kapi/dtv-core.rst @@ -11,12 +11,12 @@ Digital TV devices are implemented by several different drivers: - Frontend drivers that are usually implemented as two separate drivers: - - A tuner driver that implements the logic with commands the part of the - hardware with is responsible to tune into a digital TV transponder or + - A tuner driver that implements the logic which commands the part of + the hardware responsible for tuning into a digital TV transponder or physical channel. The output of a tuner is usually a baseband or Intermediate Frequency (IF) signal; - - A demodulator driver (a.k.a "demod") that implements the logic with + - A demodulator driver (a.k.a "demod") that implements the logic which commands the digital TV decoding hardware. The output of a demod is a digital stream, with multiple audio, video and data channels typically multiplexed using MPEG Transport Stream [#f1]_. -- cgit From 84060c65a8fa14ea9974a8f15b8280517b869cdd Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Thu, 30 May 2019 05:33:12 -0400 Subject: media: media/doc: Allow sizeimage to be set by v4l clients This changes v4l2_pix_format and v4l2_plane_pix_format sizeimage field description to allow v4l clients to set bigger image size in case of variable length compressed data. Presently s5p-mfc and mtk-vcodec codec drivers use that. Lets make it obvious in the documentation. Signed-off-by: Stanimir Varbanov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst | 15 ++++++++++++++- Documentation/media/uapi/v4l/pixfmt-v4l2.rst | 13 ++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst index 5688c816e334..db43dda5aafb 100644 --- a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst +++ b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst @@ -31,7 +31,20 @@ describing all planes of that format. * - __u32 - ``sizeimage`` - - Maximum size in bytes required for image data in this plane. + - Maximum size in bytes required for image data in this plane, + set by the driver. When the image consists of variable length + compressed data this is the number of bytes required by the + codec to support the worst-case compression scenario. + + The driver will set the value for uncompressed images. + + Clients are allowed to set the sizeimage field for variable length + compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at + :ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the + value itself, or it may modify the provided value based on + alignment requirements or minimum/maximum size requirements. + If the client wants to leave this to the driver, then it should + set sizeimage to 0. * - __u32 - ``bytesperline`` - Distance in bytes between the leftmost pixels in two adjacent diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst index 71eebfc6d853..da6da2ef139a 100644 --- a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst +++ b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst @@ -89,7 +89,18 @@ Single-planar format structure - Size in bytes of the buffer to hold a complete image, set by the driver. Usually this is ``bytesperline`` times ``height``. When the image consists of variable length compressed data this is the - maximum number of bytes required to hold an image. + number of bytes required by the codec to support the worst-case + compression scenario. + + The driver will set the value for uncompressed images. + + Clients are allowed to set the sizeimage field for variable length + compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at + :ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the + value itself, or it may modify the provided value based on + alignment requirements or minimum/maximum size requirements. + If the client wants to leave this to the driver, then it should + set sizeimage to 0. * - __u32 - ``colorspace`` - Image colorspace, from enum :c:type:`v4l2_colorspace`. -- cgit From 6aace2f89f785482efecbec3d61dfa41e0a64f9c Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 5 Jun 2019 10:31:06 -0400 Subject: media: Documentation: update email address Use hverkuil-cisco@xs4all.nl instead of hans.verkuil@cisco.com. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/ABI/testing/debugfs-cec-error-inj | 2 +- Documentation/media/uapi/cec/cec-api.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/debugfs-cec-error-inj b/Documentation/ABI/testing/debugfs-cec-error-inj index 122b65c5fe62..4c3596c6d25b 100644 --- a/Documentation/ABI/testing/debugfs-cec-error-inj +++ b/Documentation/ABI/testing/debugfs-cec-error-inj @@ -1,6 +1,6 @@ What: /sys/kernel/debug/cec/*/error-inj Date: March 2018 -Contact: Hans Verkuil +Contact: Hans Verkuil Description: The CEC Framework allows for CEC error injection commands through diff --git a/Documentation/media/uapi/cec/cec-api.rst b/Documentation/media/uapi/cec/cec-api.rst index b614bf81aa20..0780ba07995a 100644 --- a/Documentation/media/uapi/cec/cec-api.rst +++ b/Documentation/media/uapi/cec/cec-api.rst @@ -39,7 +39,7 @@ Revision and Copyright ********************** Authors: -- Verkuil, Hans +- Verkuil, Hans - Initial version. -- cgit From 323a53c41292a0d7efc8748856c623324c8d7c21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Jun 2019 07:55:09 -0700 Subject: ipv6: tcp: enable flowlabel reflection in some RST packets When RST packets are sent because no socket could be found, it makes sense to use flowlabel_reflect sysctl to decide if a reflection of the flowlabel is requested. This extends commit 22b6722bfa59 ("ipv6: Add sysctl for per namespace flow label reflection"), for some TCP RST packets. In order to provide full control of this new feature, flowlabel_reflect becomes a bitmask. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a73b3a02e49a..f4b1043e92ed 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1429,14 +1429,24 @@ flowlabel_state_ranges - BOOLEAN FALSE: disabled Default: true -flowlabel_reflect - BOOLEAN - Automatically reflect the flow label. Needed for Path MTU +flowlabel_reflect - INTEGER + Control flow label reflection. Needed for Path MTU Discovery to work with Equal Cost Multipath Routing in anycast environments. See RFC 7690 and: https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 - TRUE: enabled - FALSE: disabled - Default: FALSE + + This is a mask of two bits. + 1: enabled for established flows + + Note that this prevents automatic flowlabel changes, as done + in "tcp: change IPv6 flow-label upon receiving spurious retransmission" + and "tcp: Change txhash on every SYN and RTO retransmit" + + 2: enabled for TCP RESET packets (no active listener) + If set, a RST packet sent in response to a SYN packet on a closed + port will reflect the incoming flow label. + + Default: 0 fib_multipath_hash_policy - INTEGER Controls which hash policy to use for multipath routes. -- cgit From 8a6f43d4d92d730dbe20fb84b72563be87d61446 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 5 Jun 2019 12:56:56 +0300 Subject: Documentation/i915: Fix kernel-doc references to moved gem files The error messages could be more descriptive, but fix these caused by file moves: WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -internal ./drivers/gpu/drm/i915/i915_gem_shrinker.c' failed with return code 2 WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -function User command execution ./drivers/gpu/drm/i915/i915_gem_execbuffer.c' failed with return code 1 WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -internal ./drivers/gpu/drm/i915/i915_gem_tiling.c' failed with return code 2 WARNING: kernel-doc './scripts/kernel-doc -rst -enable-lineno -function buffer object tiling ./drivers/gpu/drm/i915/i915_gem_tiling.c' failed with return code 1 Fixes: 10be98a77c55 ("drm/i915: Move more GEM objects under gem/") Cc: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20190605095657.23601-1-jani.nikula@intel.com --- Documentation/gpu/i915.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst index 6c75380b2928..f98ee95da90f 100644 --- a/Documentation/gpu/i915.rst +++ b/Documentation/gpu/i915.rst @@ -349,7 +349,7 @@ of buffer object caches. Shrinking is used to make main memory available. Note that this is mostly orthogonal to evicting buffer objects, which has the goal to make space in gpu virtual address spaces. -.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_shrinker.c +.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_shrinker.c :internal: Batchbuffer Parsing @@ -373,7 +373,7 @@ Batchbuffer Pools User Batchbuffer Execution -------------------------- -.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_execbuffer.c +.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c :doc: User command execution Logical Rings, Logical Ring Contexts and Execlists @@ -382,9 +382,6 @@ Logical Rings, Logical Ring Contexts and Execlists .. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c :doc: Logical Rings, Logical Ring Contexts and Execlists -.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c - :internal: - Global GTT views ---------------- @@ -415,10 +412,10 @@ Hardware Tiling and Swizzling Details Object Tiling IOCTLs -------------------- -.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c +.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c :internal: -.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c +.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c :doc: buffer object tiling WOPCM -- cgit From affa22b5f0f7e9caf61887671abe38819737bf16 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 5 Jun 2019 12:56:57 +0300 Subject: drm/i915: fix documentation build warnings Just a straightforward bag of fixes for a clean htmldocs build. Reviewed-by: Mika Kuoppala Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20190605095657.23601-2-jani.nikula@intel.com --- Documentation/gpu/i915.rst | 6 ------ 1 file changed, 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst index f98ee95da90f..300220c4b498 100644 --- a/Documentation/gpu/i915.rst +++ b/Documentation/gpu/i915.rst @@ -475,12 +475,6 @@ i915_context_create and i915_context_free .. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h :doc: i915_context_create and i915_context_free tracepoints -switch_mm ---------- - -.. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h - :doc: switch_mm tracepoint - Perf ==== -- cgit From 9d79b2f1aed4909988ecd8b370c7919856639699 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Fri, 17 May 2019 10:26:28 +0100 Subject: dt-bindings: Add vendor prefix for HopeRun Add "Jiangsu HopeRun Software Co., Ltd." to the list of devicetree vendor prefixes as "hoperun". Website: http://www.hoperun.com/en Signed-off-by: Fabrizio Castro Reviewed-by: Chris Paterson Reviewed-by: Rob Herring Signed-off-by: Simon Horman --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 33a65a45e319..83ca4816a78b 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -371,6 +371,8 @@ patternProperties: description: Holt Integrated Circuits, Inc. "^honeywell,.*": description: Honeywell + "^hoperun,.*": + description: Jiangsu HopeRun Software Co., Ltd. "^hp,.*": description: Hewlett Packard "^holtek,.*": -- cgit From 7bdcb8e0454af9f1b3cd16239ae87c9d00f7a134 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 May 2019 22:30:31 +0200 Subject: dt-bindings: rtc: Add YAML schemas for the generic RTC bindings The real time clocks have a bunch of generic properties that are needed in a device tree. Add a YAML schemas for those. Signed-off-by: Maxime Ripard Signed-off-by: Alexandre Belloni --- Documentation/devicetree/bindings/rtc/rtc.txt | 34 +----------------- Documentation/devicetree/bindings/rtc/rtc.yaml | 50 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 33 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/rtc.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/rtc/rtc.txt b/Documentation/devicetree/bindings/rtc/rtc.txt index a97fc6a9a75e..4d6f81dccc68 100644 --- a/Documentation/devicetree/bindings/rtc/rtc.txt +++ b/Documentation/devicetree/bindings/rtc/rtc.txt @@ -1,36 +1,4 @@ -Generic device tree bindings for Real Time Clock devices -======================================================== - -This document describes generic bindings which can be used to describe Real Time -Clock devices in a device tree. - -Required properties -------------------- - -- compatible : name of RTC device following generic names recommended practice. - -For other required properties e.g. to describe register sets, -clocks, etc. check the binding documentation of the specific driver. - -Optional properties -------------------- - -- start-year : if provided, the default hardware range supported by the RTC is - shifted so the first usable year is the specified one. - -The following properties may not be supported by all drivers. However, if a -driver wants to support one of the below features, it should adapt the bindings -below. -- trickle-resistor-ohms : Selected resistor for trickle charger. Should be given - if trickle charger should be enabled -- trickle-diode-disable : Do not use internal trickle charger diode Should be - given if internal trickle charger diode should be - disabled -- wakeup-source : Enables wake up of host system on alarm -- quartz-load-femtofarads : The capacitive load of the quartz(x-tal), - expressed in femto Farad (fF). - The default value shall be listed (if optional), - and likewise all valid values. +This file has been moved to rtc.yaml. Trivial RTCs ------------ diff --git a/Documentation/devicetree/bindings/rtc/rtc.yaml b/Documentation/devicetree/bindings/rtc/rtc.yaml new file mode 100644 index 000000000000..ee237b2ed66a --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc.yaml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: RTC Generic Binding + +maintainers: + - Alexandre Belloni + +description: | + This document describes generic bindings which can be used to + describe Real Time Clock devices in a device tree. + +properties: + $nodename: + pattern: "^rtc(@.*|-[0-9a-f])*$" + + quartz-load-femtofarads: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + The capacitive load of the quartz(x-tal), expressed in femto + Farad (fF). The default value shall be listed (if optional), + and likewise all valid values. + + start-year: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + If provided, the default hardware range supported by the RTC is + shifted so the first usable year is the specified one. + + trickle-diode-disable: + $ref: /schemas/types.yaml#/definitions/flag + description: + Do not use internal trickle charger diode. Should be given if + internal trickle charger diode should be disabled. + + trickle-resistor-ohms: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Selected resistor for trickle charger. Should be given + if trickle charger should be enabled. + + wakeup-source: + $ref: /schemas/types.yaml#/definitions/flag + description: + Enables wake up of host system on alarm. + +... -- cgit From 43390e0710fdfe4bd7793570c45d9092ed93da1b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 May 2019 22:30:32 +0200 Subject: dt-bindings: rtc: Move trivial RTC over to a schemas of their own The RTC generic bindings has a bunch of devices that have a pretty simple binding, with just compatible, reg and optional interrupts properties. This is exactly what the trivial devices YAML schema has been created for, except that they can also have the start-year property, but not any other generic RTC property. Let's create a schema with those constraints. Signed-off-by: Maxime Ripard Signed-off-by: Alexandre Belloni --- Documentation/devicetree/bindings/rtc/rtc.txt | 39 --------- .../devicetree/bindings/rtc/trivial-rtc.yaml | 92 ++++++++++++++++++++++ 2 files changed, 92 insertions(+), 39 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/trivial-rtc.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/rtc/rtc.txt b/Documentation/devicetree/bindings/rtc/rtc.txt index 4d6f81dccc68..b8d36fce5e2d 100644 --- a/Documentation/devicetree/bindings/rtc/rtc.txt +++ b/Documentation/devicetree/bindings/rtc/rtc.txt @@ -1,40 +1 @@ This file has been moved to rtc.yaml. - -Trivial RTCs ------------- - -This is a list of trivial RTC devices that have simple device tree -bindings, consisting only of a compatible field, an address and -possibly an interrupt line. - - -Compatible Vendor / Chip -========== ============= -abracon,abb5zes3 AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface -abracon,abeoz9 AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface -dallas,ds1374 I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output -dallas,ds1672 Dallas DS1672 Real-time Clock -dallas,ds3232 Extremely Accurate I²C RTC with Integrated Crystal and SRAM -epson,rx8010 I2C-BUS INTERFACE REAL TIME CLOCK MODULE -epson,rx8571 I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM -epson,rx8581 I2C-BUS INTERFACE REAL TIME CLOCK MODULE -emmicro,em3027 EM Microelectronic EM3027 Real-time Clock -isil,isl1208 Intersil ISL1208 Low Power RTC with Battery Backed SRAM -isil,isl1218 Intersil ISL1218 Low Power RTC with Battery Backed SRAM -isil,isl12022 Intersil ISL12022 Real-time Clock -microcrystal,rv3028 Real Time Clock Module with I2C-Bus -microcrystal,rv3029 Real Time Clock Module with I2C-Bus -microcrystal,rv8523 Real Time Clock -nxp,pcf2127 Real-time clock -nxp,pcf2129 Real-time clock -nxp,pcf8563 Real-time clock/calendar -pericom,pt7c4338 Real-time Clock Module -ricoh,r2025sd I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,r2221tl I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rs5c372a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rs5c372b I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rv5c386 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rv5c387a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -sii,s35390a 2-wire CMOS real-time clock -whwave,sd3078 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -xircom,x1205 Xircom X1205 I2C RTC diff --git a/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml new file mode 100644 index 000000000000..0c12ce9a9b45 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/trivial-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Trivial RTCs + +maintainers: + - Alexandre Belloni + +description: | + This is a list of trivial RTC devices that have simple device tree + bindings, consisting only of a compatible field, an address and + possibly an interrupt line. + +allOf: + - $ref: "rtc.yaml#" + +properties: + compatible: + enum: + # AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface + - abracon,abb5zes3 + # AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface + - abracon,abeoz9 + # I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output + - dallas,ds1374 + # Dallas DS1672 Real-time Clock + - dallas,ds1672 + # Extremely Accurate I²C RTC with Integrated Crystal and SRAM + - dallas,ds3232 + # I2C-BUS INTERFACE REAL TIME CLOCK MODULE + - epson,rx8010 + # I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM + - epson,rx8571 + # I2C-BUS INTERFACE REAL TIME CLOCK MODULE + - epson,rx8581 + # Intersil ISL1208 Low Power RTC with Battery Backed SRAM + - isil,isl1208 + # Intersil ISL1218 Low Power RTC with Battery Backed SRAM + - isil,isl1218 + # Intersil ISL12022 Real-time Clock + - isil,isl12022 + # Real Time Clock Module with I2C-Bus + - microcrystal,rv3028 + # Real Time Clock Module with I2C-Bus + - microcrystal,rv3029 + # Real Time Clock + - microcrystal,rv8523 + # Real-time clock + - nxp,pcf2127 + # Real-time clock + - nxp,pcf2129 + # Real-time clock/calendar + - nxp,pcf8563 + # Real-time Clock Module + - pericom,pt7c4338 + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,r2025sd + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,r2221tl + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,rs5c372a + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,rs5c372b + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,rv5c386 + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - ricoh,rv5c387a + # 2-wire CMOS real-time clock + - sii,s35390a + # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC + - whwave,sd3078 + # Xircom X1205 I2C RTC + - xircom,x1205 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + start-year: true + +required: + - compatible + - reg + +additionalProperties: false + +... -- cgit From 36e63ef3582b7cf68ad21fea056a278f9ccef6eb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 May 2019 22:30:33 +0200 Subject: dt-bindings: rtc: Convert Allwinner A10 RTC to a schema The older Allwinner SoCs have an embedded RTC supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schemas. Acked-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard Signed-off-by: Alexandre Belloni --- .../bindings/rtc/allwinner,sun4i-a10-rtc.yaml | 43 ++++++++++++++++++++++ .../devicetree/bindings/rtc/sunxi-rtc.txt | 17 --------- 2 files changed, 43 insertions(+), 17 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml delete mode 100644 Documentation/devicetree/bindings/rtc/sunxi-rtc.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml new file mode 100644 index 000000000000..46d69c32b89b --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/allwinner,sun4i-a10-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 RTC Device Tree Bindings + +allOf: + - $ref: "rtc.yaml#" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + compatible: + enum: + - allwinner,sun4i-a10-rtc + - allwinner,sun7i-a20-rtc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + rtc: rtc@1c20d00 { + compatible = "allwinner,sun4i-a10-rtc"; + reg = <0x01c20d00 0x20>; + interrupts = <24>; + }; + +... diff --git a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt b/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt deleted file mode 100644 index 4a8d79c1cf08..000000000000 --- a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt +++ /dev/null @@ -1,17 +0,0 @@ -* sun4i/sun7i Real Time Clock - -RTC controller for the Allwinner A10/A20 - -Required properties: -- compatible : Should be "allwinner,sun4i-a10-rtc" or "allwinner,sun7i-a20-rtc" -- reg: physical base address of the controller and length of memory mapped - region. -- interrupts: IRQ line for the RTC. - -Example: - -rtc: rtc@1c20d00 { - compatible = "allwinner,sun4i-a10-rtc"; - reg = <0x01c20d00 0x20>; - interrupts = <24>; -}; -- cgit From 5a0797599b472d6fa35bed483e67c02c380ef041 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 May 2019 22:30:34 +0200 Subject: dt-bindings: rtc: Convert Allwinner A31 RTC to a schema The newer Allwinner SoCs have an embedded RTC supported in Linux, with a matching Device Tree binding. Now that we have the DT validation in place, let's convert the device tree bindings for that controller over to a YAML schemas. Acked-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard Signed-off-by: Alexandre Belloni --- .../bindings/rtc/allwinner,sun6i-a31-rtc.yaml | 115 +++++++++++++++++++++ .../devicetree/bindings/rtc/sun6i-rtc.txt | 46 --------- 2 files changed, 115 insertions(+), 46 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml delete mode 100644 Documentation/devicetree/bindings/rtc/sun6i-rtc.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml new file mode 100644 index 000000000000..f154bbba6a69 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/allwinner,sun6i-a31-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A31 RTC Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#clock-cells": + const: 1 + + compatible: + oneOf: + - const: allwinner,sun6i-a31-rtc + - const: allwinner,sun8i-a23-rtc + - const: allwinner,sun8i-h3-rtc + - items: + - const: allwinner,sun8i-r40-rtc + - const: allwinner,sun8i-h3-rtc + - const: allwinner,sun8i-v3-rtc + - const: allwinner,sun50i-h5-rtc + - items: + - const: allwinner,sun50i-a64-rtc + - const: allwinner,sun8i-h3-rtc + + reg: + maxItems: 1 + + interrupts: + items: + - description: RTC Alarm 0 + - description: RTC Alarm 1 + + clocks: + maxItems: 1 + + clock-output-names: + minItems: 1 + maxItems: 3 + description: + The RTC provides up to three clocks + - the Low Frequency Oscillator or LOSC, at index 0, + - the Low Frequency Oscillator External output (X32KFOUT in + the datasheet), at index 1, + - the Internal Oscillator, at index 2. + +allOf: + - $ref: "rtc.yaml#" + - if: + properties: + compatible: + contains: + const: allwinner,sun6i-a31-rtc + + then: + properties: + clock-output-names: + minItems: 1 + maxItems: 1 + + - if: + properties: + compatible: + contains: + enum: + - allwinner,sun8i-a23-rtc + - allwinner,sun8i-v3-rtc + + then: + properties: + clock-output-names: + minItems: 2 + maxItems: 2 + + - if: + properties: + compatible: + contains: + enum: + - allwinner,sun8i-h3-rtc + - allwinner,sun50i-h5-rtc + + then: + properties: + clock-output-names: + minItems: 3 + maxItems: 3 + +required: + - "#clock-cells" + - compatible + - reg + - interrupts + - clocks + - clock-output-names + +additionalProperties: false + +examples: + - | + rtc: rtc@1f00000 { + compatible = "allwinner,sun6i-a31-rtc"; + reg = <0x01f00000 0x400>; + interrupts = <0 40 4>, <0 41 4>; + clock-output-names = "osc32k"; + clocks = <&ext_osc32k>; + #clock-cells = <1>; + }; + +... diff --git a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt deleted file mode 100644 index 6b732c41392b..000000000000 --- a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt +++ /dev/null @@ -1,46 +0,0 @@ -* sun6i Real Time Clock - -RTC controller for the Allwinner A31 - -Required properties: -- compatible : Should be one of the following combinations: - - "allwinner,sun6i-a31-rtc" - - "allwinner,sun8i-a23-rtc" - - "allwinner,sun8i-h3-rtc" - - "allwinner,sun8i-r40-rtc", "allwinner,sun8i-h3-rtc" - - "allwinner,sun8i-v3-rtc" - - "allwinner,sun50i-a64-rtc", "allwinner,sun8i-h3-rtc" - - "allwinner,sun50i-h5-rtc" - - Where there are two or more compatible strings, this - denotes the hardware covered by the most specific one - is backward-compatible with the latter ones, and the - implementation for the latter ones can be used, albeit - with reduced functionality. - -- reg : physical base address of the controller and length of - memory mapped region. -- interrupts : IRQ lines for the RTC alarm 0 and alarm 1, in that order. - -Required properties for new device trees -- clocks : phandle to the 32kHz external oscillator -- clock-output-names : names of up to three clock outputs. See below. -- #clock-cells : must be equal to 1. - -The RTC provides the following clocks at the given indices: -- 0: LOSC -- 1: LOSC external output, known as X32KFOUT in the datasheet. - This clock is not available on the A31 and is deprecated for old - device trees still using the "allwinner,sun6i-a31-rtc" compatible. -- 2: InternalOSC, or internal RC oscillator (A64/H3/H5 only) - -Example: - -rtc: rtc@1f00000 { - compatible = "allwinner,sun6i-a31-rtc"; - reg = <0x01f00000 0x400>; - interrupts = <0 40 4>, <0 41 4>; - clock-output-names = "osc32k"; - clocks = <&ext_osc32k>; - #clock-cells = <1>; -}; -- cgit From f5a336496e0f9aacb451b3b72eb44f78b3fe604c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 May 2019 22:30:35 +0200 Subject: dt-bindings: rtc: sun6i: Add the R40 RTC compatible The R40 has a pretty different RTC compared to the other SoCs we've encountered so far, the most important difference being that it now has only a single interrupt, compared to the previous SoCs having two. Let's add a compatible for that. Acked-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard Signed-off-by: Alexandre Belloni --- .../bindings/rtc/allwinner,sun6i-a31-rtc.yaml | 25 +++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml index f154bbba6a69..924622f39c44 100644 --- a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml @@ -19,9 +19,7 @@ properties: - const: allwinner,sun6i-a31-rtc - const: allwinner,sun8i-a23-rtc - const: allwinner,sun8i-h3-rtc - - items: - - const: allwinner,sun8i-r40-rtc - - const: allwinner,sun8i-h3-rtc + - const: allwinner,sun8i-r40-rtc - const: allwinner,sun8i-v3-rtc - const: allwinner,sun50i-h5-rtc - items: @@ -32,6 +30,8 @@ properties: maxItems: 1 interrupts: + minItems: 1 + maxItems: 2 items: - description: RTC Alarm 0 - description: RTC Alarm 1 @@ -69,6 +69,7 @@ allOf: contains: enum: - allwinner,sun8i-a23-rtc + - allwinner,sun8i-r40-rtc - allwinner,sun8i-v3-rtc then: @@ -91,6 +92,24 @@ allOf: minItems: 3 maxItems: 3 + - if: + properties: + compatible: + contains: + const: allwinner,sun8i-r40-rtc + + then: + properties: + interrupts: + minItems: 1 + maxItems: 1 + + else: + properties: + interrupts: + minItems: 2 + maxItems: 2 + required: - "#clock-cells" - compatible -- cgit From ee5dc0491c38ae4e4e583d7532d470754bb173f6 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 4 Jun 2019 10:26:56 +1000 Subject: docs: filesystems: vfs: Render method descriptions Currently vfs.rst does not render well into HTML the method descriptions for VFS data structures. We can improve the HTML output by putting the description string on a new line following the method name. Suggested-by: Jonathan Corbet Signed-off-by: Tobin C. Harding Signed-off-by: Jonathan Corbet --- Documentation/filesystems/vfs.rst | 1147 +++++++++++++++++++++---------------- 1 file changed, 642 insertions(+), 505 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 2ffbdf5f392c..0f85ab21c2ca 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -125,35 +125,46 @@ members are defined: struct lock_class_key s_umount_key; }; -``name``: the name of the filesystem type, such as "ext2", "iso9660", +``name`` + the name of the filesystem type, such as "ext2", "iso9660", "msdos" and so on -``fs_flags``: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) +``fs_flags`` + various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) -``mount``: the method to call when a new instance of this filesystem should -be mounted +``mount`` + the method to call when a new instance of this filesystem should + be mounted -``kill_sb``: the method to call when an instance of this filesystem - should be shut down +``kill_sb`` + the method to call when an instance of this filesystem should be + shut down -``owner``: for internal VFS use: you should initialize this to THIS_MODULE in - most cases. -``next``: for internal VFS use: you should initialize this to NULL +``owner`` + for internal VFS use: you should initialize this to THIS_MODULE + in most cases. + +``next`` + for internal VFS use: you should initialize this to NULL s_lock_key, s_umount_key: lockdep-specific The mount() method has the following arguments: -``struct file_system_type *fs_type``: describes the filesystem, partly initialized - by the specific filesystem code +``struct file_system_type *fs_type`` + describes the filesystem, partly initialized by the specific + filesystem code -``int flags``: mount flags +``int flags`` + mount flags -``const char *dev_name``: the device name we are mounting. +``const char *dev_name`` + the device name we are mounting. -``void *data``: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) +``void *data`` + arbitrary mount options, usually comes as an ASCII string (see + "Mount Options" section) The mount() method must return the root dentry of the tree requested by caller. An active reference to its superblock must be grabbed and the @@ -178,22 +189,27 @@ implementation. Usually, a filesystem uses one of the generic mount() implementations and provides a fill_super() callback instead. The generic variants are: -``mount_bdev``: mount a filesystem residing on a block device +``mount_bdev`` + mount a filesystem residing on a block device -``mount_nodev``: mount a filesystem that is not backed by a device +``mount_nodev`` + mount a filesystem that is not backed by a device -``mount_single``: mount a filesystem which shares the instance between - all mounts +``mount_single`` + mount a filesystem which shares the instance between all mounts A fill_super() callback implementation has the following arguments: -``struct super_block *sb``: the superblock structure. The callback - must initialize this properly. +``struct super_block *sb`` + the superblock structure. The callback must initialize this + properly. -``void *data``: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) +``void *data`` + arbitrary mount options, usually comes as an ASCII string (see + "Mount Options" section) -``int silent``: whether or not to be silent on error +``int silent`` + whether or not to be silent on error The Superblock Object @@ -240,87 +256,106 @@ noted. This means that most methods can block safely. All methods are only called from a process context (i.e. not from an interrupt handler or bottom half). -``alloc_inode``: this method is called by alloc_inode() to allocate memory - for struct inode and initialize it. If this function is not +``alloc_inode`` + this method is called by alloc_inode() to allocate memory for + struct inode and initialize it. If this function is not defined, a simple 'struct inode' is allocated. Normally alloc_inode will be used to allocate a larger structure which contains a 'struct inode' embedded within it. -``destroy_inode``: this method is called by destroy_inode() to release - resources allocated for struct inode. It is only required if +``destroy_inode`` + this method is called by destroy_inode() to release resources + allocated for struct inode. It is only required if ->alloc_inode was defined and simply undoes anything done by ->alloc_inode. -``dirty_inode``: this method is called by the VFS to mark an inode dirty. +``dirty_inode`` + this method is called by the VFS to mark an inode dirty. -``write_inode``: this method is called when the VFS needs to write an - inode to disc. The second parameter indicates whether the write - should be synchronous or not, not all filesystems check this flag. +``write_inode`` + this method is called when the VFS needs to write an inode to + disc. The second parameter indicates whether the write should + be synchronous or not, not all filesystems check this flag. -``drop_inode``: called when the last access to the inode is dropped, - with the inode->i_lock spinlock held. +``drop_inode`` + called when the last access to the inode is dropped, with the + inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do not - want to cache inodes - causing "delete_inode" to always be + semantics) or "generic_delete_inode" (for filesystems that do + not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the - old practice of using "force_delete" in the put_inode() case, - but does not have the races that the "force_delete()" approach - had. + The "generic_delete_inode()" behavior is equivalent to the old + practice of using "force_delete" in the put_inode() case, but + does not have the races that the "force_delete()" approach had. -``delete_inode``: called when the VFS wants to delete an inode +``delete_inode`` + called when the VFS wants to delete an inode -``put_super``: called when the VFS wishes to free the superblock +``put_super`` + called when the VFS wishes to free the superblock (i.e. unmount). This is called with the superblock lock held -``sync_fs``: called when VFS is writing out all dirty data associated with - a superblock. The second parameter indicates whether the method +``sync_fs`` + called when VFS is writing out all dirty data associated with a + superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. -``freeze_fs``: called when VFS is locking a filesystem and - forcing it into a consistent state. This method is currently - used by the Logical Volume Manager (LVM). +``freeze_fs`` + called when VFS is locking a filesystem and forcing it into a + consistent state. This method is currently used by the Logical + Volume Manager (LVM). -``unfreeze_fs``: called when VFS is unlocking a filesystem and making it writable +``unfreeze_fs`` + called when VFS is unlocking a filesystem and making it writable again. -``statfs``: called when the VFS needs to get filesystem statistics. +``statfs`` + called when the VFS needs to get filesystem statistics. -``remount_fs``: called when the filesystem is remounted. This is called - with the kernel lock held +``remount_fs`` + called when the filesystem is remounted. This is called with + the kernel lock held -``clear_inode``: called then the VFS clears the inode. Optional +``clear_inode`` + called then the VFS clears the inode. Optional -``umount_begin``: called when the VFS is unmounting a filesystem. +``umount_begin`` + called when the VFS is unmounting a filesystem. -``show_options``: called by the VFS to show mount options for - /proc//mounts. (see "Mount Options" section) +``show_options`` + called by the VFS to show mount options for /proc//mounts. + (see "Mount Options" section) -``quota_read``: called by the VFS to read from filesystem quota file. +``quota_read`` + called by the VFS to read from filesystem quota file. -``quota_write``: called by the VFS to write to filesystem quota file. +``quota_write`` + called by the VFS to write to filesystem quota file. -``nr_cached_objects``: called by the sb cache shrinking function for the - filesystem to return the number of freeable cached objects it contains. +``nr_cached_objects`` + called by the sb cache shrinking function for the filesystem to + return the number of freeable cached objects it contains. Optional. -``free_cache_objects``: called by the sb cache shrinking function for the - filesystem to scan the number of objects indicated to try to free them. - Optional, but any filesystem implementing this method needs to also - implement ->nr_cached_objects for it to be called correctly. +``free_cache_objects`` + called by the sb cache shrinking function for the filesystem to + scan the number of objects indicated to try to free them. + Optional, but any filesystem implementing this method needs to + also implement ->nr_cached_objects for it to be called + correctly. We can't do anything with any errors that the filesystem might - encountered, hence the void return type. This will never be called if - the VM is trying to reclaim under GFP_NOFS conditions, hence this - method does not need to handle that situation itself. + encountered, hence the void return type. This will never be + called if the VM is trying to reclaim under GFP_NOFS conditions, + hence this method does not need to handle that situation itself. - Implementations must include conditional reschedule calls inside any - scanning loop that is done. This allows the VFS to determine - appropriate scan batch sizes without having to worry about whether - implementations will cause holdoff problems due to large scan batch - sizes. + Implementations must include conditional reschedule calls inside + any scanning loop that is done. This allows the VFS to + determine appropriate scan batch sizes without having to worry + about whether implementations will cause holdoff problems due to + large scan batch sizes. Whoever sets up the inode is responsible for filling in the "i_op" field. This is a pointer to a "struct inode_operations" which describes @@ -334,23 +369,31 @@ On filesystems that support extended attributes (xattrs), the s_xattr superblock field points to a NULL-terminated array of xattr handlers. Extended attributes are name:value pairs. -``name``: Indicates that the handler matches attributes with the specified name - (such as "system.posix_acl_access"); the prefix field must be NULL. +``name`` + Indicates that the handler matches attributes with the specified + name (such as "system.posix_acl_access"); the prefix field must + be NULL. -``prefix``: Indicates that the handler matches all attributes with the specified - name prefix (such as "user."); the name field must be NULL. +``prefix`` + Indicates that the handler matches all attributes with the + specified name prefix (such as "user."); the name field must be + NULL. -``list``: Determine if attributes matching this xattr handler should be listed - for a particular dentry. Used by some listxattr implementations like - generic_listxattr. +``list`` + Determine if attributes matching this xattr handler should be + listed for a particular dentry. Used by some listxattr + implementations like generic_listxattr. -``get``: Called by the VFS to get the value of a particular extended attribute. - This method is called by the getxattr(2) system call. +``get`` + Called by the VFS to get the value of a particular extended + attribute. This method is called by the getxattr(2) system + call. -``set``: Called by the VFS to set the value of a particular extended attribute. - When the new value is NULL, called to remove a particular extended - attribute. This method is called by the the setxattr(2) and - removexattr(2) system calls. +``set`` + Called by the VFS to set the value of a particular extended + attribute. When the new value is NULL, called to remove a + particular extended attribute. This method is called by the the + setxattr(2) and removexattr(2) system calls. When none of the xattr handlers of a filesystem match the specified attribute name or when a filesystem doesn't support extended attributes, @@ -399,128 +442,147 @@ As of kernel 2.6.22, the following members are defined: Again, all methods are called without any locks being held, unless otherwise noted. -``create``: called by the open(2) and creat(2) system calls. Only - required if you want to support regular files. The dentry you - get should not have an inode (i.e. it should be a negative - dentry). Here you will probably call d_instantiate() with the - dentry and the newly created inode +``create`` + called by the open(2) and creat(2) system calls. Only required + if you want to support regular files. The dentry you get should + not have an inode (i.e. it should be a negative dentry). Here + you will probably call d_instantiate() with the dentry and the + newly created inode -``lookup``: called when the VFS needs to look up an inode in a parent +``lookup`` + called when the VFS needs to look up an inode in a parent directory. The name to look for is found in the dentry. This method must call d_add() to insert the found inode into the dentry. The "i_count" field in the inode structure should be incremented. If the named inode does not exist a NULL inode should be inserted into the dentry (this is called a negative - dentry). Returning an error code from this routine must only - be done on a real error, otherwise creating inodes with system + dentry). Returning an error code from this routine must only be + done on a real error, otherwise creating inodes with system calls like create(2), mknod(2), mkdir(2) and so on will fail. If you wish to overload the dentry methods then you should - initialise the "d_dop" field in the dentry; this is a pointer - to a struct "dentry_operations". - This method is called with the directory inode semaphore held + initialise the "d_dop" field in the dentry; this is a pointer to + a struct "dentry_operations". This method is called with the + directory inode semaphore held -``link``: called by the link(2) system call. Only required if you want - to support hard links. You will probably need to call +``link`` + called by the link(2) system call. Only required if you want to + support hard links. You will probably need to call d_instantiate() just as you would in the create() method -``unlink``: called by the unlink(2) system call. Only required if you - want to support deleting inodes +``unlink`` + called by the unlink(2) system call. Only required if you want + to support deleting inodes -``symlink``: called by the symlink(2) system call. Only required if you - want to support symlinks. You will probably need to call +``symlink`` + called by the symlink(2) system call. Only required if you want + to support symlinks. You will probably need to call d_instantiate() just as you would in the create() method -``mkdir``: called by the mkdir(2) system call. Only required if you want +``mkdir`` + called by the mkdir(2) system call. Only required if you want to support creating subdirectories. You will probably need to call d_instantiate() just as you would in the create() method -``rmdir``: called by the rmdir(2) system call. Only required if you want +``rmdir`` + called by the rmdir(2) system call. Only required if you want to support deleting subdirectories -``mknod``: called by the mknod(2) system call to create a device (char, - block) inode or a named pipe (FIFO) or socket. Only required - if you want to support creating these types of inodes. You - will probably need to call d_instantiate() just as you would - in the create() method +``mknod`` + called by the mknod(2) system call to create a device (char, + block) inode or a named pipe (FIFO) or socket. Only required if + you want to support creating these types of inodes. You will + probably need to call d_instantiate() just as you would in the + create() method -``rename``: called by the rename(2) system call to rename the object to - have the parent and name given by the second inode and dentry. +``rename`` + called by the rename(2) system call to rename the object to have + the parent and name given by the second inode and dentry. The filesystem must return -EINVAL for any unsupported or - unknown flags. Currently the following flags are implemented: - (1) RENAME_NOREPLACE: this flag indicates that if the target - of the rename exists the rename should fail with -EEXIST - instead of replacing the target. The VFS already checks for - existence, so for local filesystems the RENAME_NOREPLACE - implementation is equivalent to plain rename. + unknown flags. Currently the following flags are implemented: + (1) RENAME_NOREPLACE: this flag indicates that if the target of + the rename exists the rename should fail with -EEXIST instead of + replacing the target. The VFS already checks for existence, so + for local filesystems the RENAME_NOREPLACE implementation is + equivalent to plain rename. (2) RENAME_EXCHANGE: exchange source and target. Both must - exist; this is checked by the VFS. Unlike plain rename, - source and target may be of different type. - -``get_link``: called by the VFS to follow a symbolic link to the - inode it points to. Only required if you want to support - symbolic links. This method returns the symlink body - to traverse (and possibly resets the current position with - nd_jump_link()). If the body won't go away until the inode - is gone, nothing else is needed; if it needs to be otherwise - pinned, arrange for its release by having get_link(..., ..., done) - do set_delayed_call(done, destructor, argument). - In that case destructor(argument) will be called once VFS is - done with the body you've returned. - May be called in RCU mode; that is indicated by NULL dentry + exist; this is checked by the VFS. Unlike plain rename, source + and target may be of different type. + +``get_link`` + called by the VFS to follow a symbolic link to the inode it + points to. Only required if you want to support symbolic links. + This method returns the symlink body to traverse (and possibly + resets the current position with nd_jump_link()). If the body + won't go away until the inode is gone, nothing else is needed; + if it needs to be otherwise pinned, arrange for its release by + having get_link(..., ..., done) do set_delayed_call(done, + destructor, argument). In that case destructor(argument) will + be called once VFS is done with the body you've returned. May + be called in RCU mode; that is indicated by NULL dentry argument. If request can't be handled without leaving RCU mode, have it return ERR_PTR(-ECHILD). - If the filesystem stores the symlink target in ->i_link, the VFS may use it directly without calling ->get_link(); however, ->get_link() must still be provided. ->i_link must not be freed until after an RCU grace period. Writing to ->i_link post-iget() time requires a 'release' memory barrier. -``readlink``: this is now just an override for use by readlink(2) for the +``readlink`` + this is now just an override for use by readlink(2) for the cases when ->get_link uses nd_jump_link() or object is not in fact a symlink. Normally filesystems should only implement ->get_link for symlinks and readlink(2) will automatically use that. -``permission``: called by the VFS to check for access rights on a POSIX-like +``permission`` + called by the VFS to check for access rights on a POSIX-like filesystem. - May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or - storing to the inode. + May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in + rcu-walk mode, the filesystem must check the permission without + blocking or storing to the inode. - If a situation is encountered that rcu-walk cannot handle, return + If a situation is encountered that rcu-walk cannot handle, + return -ECHILD and it will be called again in ref-walk mode. -``setattr``: called by the VFS to set attributes for a file. This method - is called by chmod(2) and related system calls. - -``getattr``: called by the VFS to get attributes of a file. This method - is called by stat(2) and related system calls. - -``listxattr``: called by the VFS to list all extended attributes for a - given file. This method is called by the listxattr(2) system call. - -``update_time``: called by the VFS to update a specific time or the i_version of - an inode. If this is not defined the VFS will update the inode itself - and call mark_inode_dirty_sync. - -``atomic_open``: called on the last component of an open. Using this optional - method the filesystem can look up, possibly create and open the file in - one atomic operation. If it wants to leave actual opening to the - caller (e.g. if the file turned out to be a symlink, device, or just - something filesystem won't do atomic open for), it may signal this by - returning finish_no_open(file, dentry). This method is only called if - the last component is negative or needs lookup. Cached positive dentries - are still handled by f_op->open(). If the file was created, - FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL - the method must only succeed if the file didn't exist and hence FMODE_CREATED - shall always be set on success. - -``tmpfile``: called in the end of O_TMPFILE open(). Optional, equivalent to - atomically creating, opening and unlinking a file in given directory. +``setattr`` + called by the VFS to set attributes for a file. This method is + called by chmod(2) and related system calls. + +``getattr`` + called by the VFS to get attributes of a file. This method is + called by stat(2) and related system calls. + +``listxattr`` + called by the VFS to list all extended attributes for a given + file. This method is called by the listxattr(2) system call. + +``update_time`` + called by the VFS to update a specific time or the i_version of + an inode. If this is not defined the VFS will update the inode + itself and call mark_inode_dirty_sync. + +``atomic_open`` + called on the last component of an open. Using this optional + method the filesystem can look up, possibly create and open the + file in one atomic operation. If it wants to leave actual + opening to the caller (e.g. if the file turned out to be a + symlink, device, or just something filesystem won't do atomic + open for), it may signal this by returning finish_no_open(file, + dentry). This method is only called if the last component is + negative or needs lookup. Cached positive dentries are still + handled by f_op->open(). If the file was created, FMODE_CREATED + flag should be set in file->f_mode. In case of O_EXCL the + method must only succeed if the file didn't exist and hence + FMODE_CREATED shall always be set on success. + +``tmpfile`` + called in the end of O_TMPFILE open(). Optional, equivalent to + atomically creating, opening and unlinking a file in given + directory. The Address Space Object @@ -673,70 +735,75 @@ cache in your filesystem. The following members are defined: int (*swap_deactivate)(struct file *); }; -``writepage``: called by the VM to write a dirty page to backing store. - This may happen for data integrity reasons (i.e. 'sync'), or - to free up memory (flush). The difference can be seen in - wbc->sync_mode. - The PG_Dirty flag has been cleared and PageLocked is true. - writepage should start writeout, should set PG_Writeback, - and should make sure the page is unlocked, either synchronously - or asynchronously when the write operation completes. - - If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to - try too hard if there are problems, and may choose to write out - other pages from the mapping if that is easier (e.g. due to - internal dependencies). If it chooses not to start writeout, it - should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep - calling ->writepage on that page. - - See the file "Locking" for more details. - -``readpage``: called by the VM to read a page from backing store. - The page will be Locked when readpage is called, and should be - unlocked and marked uptodate once the read completes. - If ->readpage discovers that it needs to unlock the page for - some reason, it can do so, and then return AOP_TRUNCATED_PAGE. - In this case, the page will be relocated, relocked and if - that all succeeds, ->readpage will be called again. - -``writepages``: called by the VM to write out pages associated with the +``writepage`` + called by the VM to write a dirty page to backing store. This + may happen for data integrity reasons (i.e. 'sync'), or to free + up memory (flush). The difference can be seen in + wbc->sync_mode. The PG_Dirty flag has been cleared and + PageLocked is true. writepage should start writeout, should set + PG_Writeback, and should make sure the page is unlocked, either + synchronously or asynchronously when the write operation + completes. + + If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to + try too hard if there are problems, and may choose to write out + other pages from the mapping if that is easier (e.g. due to + internal dependencies). If it chooses not to start writeout, it + should return AOP_WRITEPAGE_ACTIVATE so that the VM will not + keep calling ->writepage on that page. + + See the file "Locking" for more details. + +``readpage`` + called by the VM to read a page from backing store. The page + will be Locked when readpage is called, and should be unlocked + and marked uptodate once the read completes. If ->readpage + discovers that it needs to unlock the page for some reason, it + can do so, and then return AOP_TRUNCATED_PAGE. In this case, + the page will be relocated, relocked and if that all succeeds, + ->readpage will be called again. + +``writepages`` + called by the VM to write out pages associated with the address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then the writeback_control will specify a range of pages that must be - written out. If it is WBC_SYNC_NONE, then a nr_to_write is given - and that many pages should be written if possible. - If no ->writepages is given, then mpage_writepages is used - instead. This will choose pages from the address space that are - tagged as DIRTY and will pass them to ->writepage. - -``set_page_dirty``: called by the VM to set a page dirty. - This is particularly needed if an address space attaches - private data to a page, and that data needs to be updated when - a page is dirtied. This is called, for example, when a memory - mapped page gets modified. + written out. If it is WBC_SYNC_NONE, then a nr_to_write is + given and that many pages should be written if possible. If no + ->writepages is given, then mpage_writepages is used instead. + This will choose pages from the address space that are tagged as + DIRTY and will pass them to ->writepage. + +``set_page_dirty`` + called by the VM to set a page dirty. This is particularly + needed if an address space attaches private data to a page, and + that data needs to be updated when a page is dirtied. This is + called, for example, when a memory mapped page gets modified. If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. -``readpages``: called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of - readpage. Instead of just one page, several pages are - requested. +``readpages`` + called by the VM to read pages associated with the address_space + object. This is essentially just a vector version of readpage. + Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. -``write_begin``: - Called by the generic buffered write code to ask the filesystem to - prepare to write len bytes at the given offset in the file. The - address_space should check that the write will be able to complete, - by allocating space if necessary and doing any other internal - housekeeping. If the write will update parts of any basic-blocks on - storage, then those blocks should be pre-read (if they haven't been - read already) so that the updated blocks can be written out properly. +``write_begin`` + Called by the generic buffered write code to ask the filesystem + to prepare to write len bytes at the given offset in the file. + The address_space should check that the write will be able to + complete, by allocating space if necessary and doing any other + internal housekeeping. If the write will update parts of any + basic-blocks on storage, then those blocks should be pre-read + (if they haven't been read already) so that the updated blocks + can be written out properly. - The filesystem must return the locked pagecache page for the specified - offset, in ``*pagep``, for the caller to write into. + The filesystem must return the locked pagecache page for the + specified offset, in ``*pagep``, for the caller to write into. - It must be able to cope with short writes (where the length passed to - write_begin is greater than the number of bytes copied into the page). + It must be able to cope with short writes (where the length + passed to write_begin is greater than the number of bytes copied + into the page). flags is a field for AOP_FLAG_xxx flags, described in include/linux/fs.h. @@ -744,114 +811,128 @@ cache in your filesystem. The following members are defined: A void * may be returned in fsdata, which then gets passed into write_end. - Returns 0 on success; < 0 on failure (which is the error code), in - which case write_end is not called. - -``write_end``: After a successful write_begin, and data copy, write_end must - be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied. - - The filesystem must take care of unlocking the page and releasing it - refcount, and updating i_size. - - Returns < 0 on failure, otherwise the number of bytes (<= 'copied') - that were able to be copied into pagecache. - -``bmap``: called by the VFS to map a logical block offset within object to - physical block number. This method is used by the FIBMAP - ioctl and for working with swap-files. To be able to swap to - a file, the file must have a stable mapping to a block - device. The swap system does not go through the filesystem - but instead uses bmap to find out where the blocks in the file - are and uses those addresses directly. - -``invalidatepage``: If a page has PagePrivate set, then invalidatepage - will be called when part or all of the page is to be removed - from the address space. This generally corresponds to either a - truncation, punch hole or a complete invalidation of the address + Returns 0 on success; < 0 on failure (which is the error code), + in which case write_end is not called. + +``write_end`` + After a successful write_begin, and data copy, write_end must be + called. len is the original len passed to write_begin, and + copied is the amount that was able to be copied. + + The filesystem must take care of unlocking the page and + releasing it refcount, and updating i_size. + + Returns < 0 on failure, otherwise the number of bytes (<= + 'copied') that were able to be copied into pagecache. + +``bmap`` + called by the VFS to map a logical block offset within object to + physical block number. This method is used by the FIBMAP ioctl + and for working with swap-files. To be able to swap to a file, + the file must have a stable mapping to a block device. The swap + system does not go through the filesystem but instead uses bmap + to find out where the blocks in the file are and uses those + addresses directly. + +``invalidatepage`` + If a page has PagePrivate set, then invalidatepage will be + called when part or all of the page is to be removed from the + address space. This generally corresponds to either a + truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' will be PAGE_SIZE). Any private data associated with the page - should be updated to reflect this truncation. If offset is 0 and - length is PAGE_SIZE, then the private data should be released, - because the page must be able to be completely discarded. This may - be done by calling the ->releasepage function, but in this case the - release MUST succeed. - -``releasepage``: releasepage is called on PagePrivate pages to indicate - that the page should be freed if possible. ->releasepage - should remove any private data from the page and clear the - PagePrivate flag. If releasepage() fails for some reason, it must - indicate failure with a 0 return value. - releasepage() is used in two distinct though related cases. The - first is when the VM finds a clean page with no active users and - wants to make it a free page. If ->releasepage succeeds, the - page will be removed from the address_space and become free. + should be updated to reflect this truncation. If offset is 0 + and length is PAGE_SIZE, then the private data should be + released, because the page must be able to be completely + discarded. This may be done by calling the ->releasepage + function, but in this case the release MUST succeed. + +``releasepage`` + releasepage is called on PagePrivate pages to indicate that the + page should be freed if possible. ->releasepage should remove + any private data from the page and clear the PagePrivate flag. + If releasepage() fails for some reason, it must indicate failure + with a 0 return value. releasepage() is used in two distinct + though related cases. The first is when the VM finds a clean + page with no active users and wants to make it a free page. If + ->releasepage succeeds, the page will be removed from the + address_space and become free. The second case is when a request has been made to invalidate - some or all pages in an address_space. This can happen - through the fadvise(POSIX_FADV_DONTNEED) system call or by the - filesystem explicitly requesting it as nfs and 9fs do (when - they believe the cache may be out of date with storage) by - calling invalidate_inode_pages2(). - If the filesystem makes such a call, and needs to be certain - that all pages are invalidated, then its releasepage will - need to ensure this. Possibly it can clear the PageUptodate - bit if it cannot free private data yet. - -``freepage``: freepage is called once the page is no longer visible in - the page cache in order to allow the cleanup of any private - data. Since it may be called by the memory reclaimer, it - should not assume that the original address_space mapping still - exists, and it should not block. - -``direct_IO``: called by the generic read/write routines to perform - direct_IO - that is IO requests which bypass the page cache - and transfer data directly between the storage and the - application's address space. - -``isolate_page``: Called by the VM when isolating a movable non-lru page. - If page is successfully isolated, VM marks the page as PG_isolated - via __SetPageIsolated. - -``migrate_page``: This is used to compact the physical memory usage. - If the VM wants to relocate a page (maybe off a memory card - that is signalling imminent failure) it will pass a new page - and an old page to this function. migrate_page should - transfer any private data across and update any references - that it has to the page. - -``putback_page``: Called by the VM when isolated page's migration fails. - -``launder_page``: Called before freeing a page - it writes back the dirty page. To - prevent redirtying the page, it is kept locked during the whole - operation. - -``is_partially_uptodate``: Called by the VM when reading a file through the - pagecache when the underlying blocksize != pagesize. If the required - block is up to date then the read can complete without needing the IO - to bring the whole page up to date. - -``is_dirty_writeback``: Called by the VM when attempting to reclaim a page. - The VM uses dirty and writeback information to determine if it needs - to stall to allow flushers a chance to complete some IO. Ordinarily - it can use PageDirty and PageWriteback but some filesystems have - more complex state (unstable pages in NFS prevent reclaim) or - do not set those flags due to locking problems. This callback - allows a filesystem to indicate to the VM if a page should be - treated as dirty or writeback for the purposes of stalling. - -``error_remove_page``: normally set to generic_error_remove_page if truncation - is ok for this address space. Used for memory failure handling. + some or all pages in an address_space. This can happen through + the fadvise(POSIX_FADV_DONTNEED) system call or by the + filesystem explicitly requesting it as nfs and 9fs do (when they + believe the cache may be out of date with storage) by calling + invalidate_inode_pages2(). If the filesystem makes such a call, + and needs to be certain that all pages are invalidated, then its + releasepage will need to ensure this. Possibly it can clear the + PageUptodate bit if it cannot free private data yet. + +``freepage`` + freepage is called once the page is no longer visible in the + page cache in order to allow the cleanup of any private data. + Since it may be called by the memory reclaimer, it should not + assume that the original address_space mapping still exists, and + it should not block. + +``direct_IO`` + called by the generic read/write routines to perform direct_IO - + that is IO requests which bypass the page cache and transfer + data directly between the storage and the application's address + space. + +``isolate_page`` + Called by the VM when isolating a movable non-lru page. If page + is successfully isolated, VM marks the page as PG_isolated via + __SetPageIsolated. + +``migrate_page`` + This is used to compact the physical memory usage. If the VM + wants to relocate a page (maybe off a memory card that is + signalling imminent failure) it will pass a new page and an old + page to this function. migrate_page should transfer any private + data across and update any references that it has to the page. + +``putback_page`` + Called by the VM when isolated page's migration fails. + +``launder_page`` + Called before freeing a page - it writes back the dirty page. + To prevent redirtying the page, it is kept locked during the + whole operation. + +``is_partially_uptodate`` + Called by the VM when reading a file through the pagecache when + the underlying blocksize != pagesize. If the required block is + up to date then the read can complete without needing the IO to + bring the whole page up to date. + +``is_dirty_writeback`` + Called by the VM when attempting to reclaim a page. The VM uses + dirty and writeback information to determine if it needs to + stall to allow flushers a chance to complete some IO. + Ordinarily it can use PageDirty and PageWriteback but some + filesystems have more complex state (unstable pages in NFS + prevent reclaim) or do not set those flags due to locking + problems. This callback allows a filesystem to indicate to the + VM if a page should be treated as dirty or writeback for the + purposes of stalling. + +``error_remove_page`` + normally set to generic_error_remove_page if truncation is ok + for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. -``swap_activate``: Called when swapon is used on a file to allocate - space if necessary and pin the block lookup information in - memory. A return value of zero indicates success, - in which case this file can be used to back swapspace. +``swap_activate`` + Called when swapon is used on a file to allocate space if + necessary and pin the block lookup information in memory. A + return value of zero indicates success, in which case this file + can be used to back swapspace. -``swap_deactivate``: Called during swapoff on files where swap_activate - was successful. +``swap_deactivate`` + Called during swapoff on files where swap_activate was + successful. The File Object @@ -912,91 +993,120 @@ This describes how the VFS can manipulate an open file. As of kernel Again, all methods are called without any locks being held, unless otherwise noted. -``llseek``: called when the VFS needs to move the file position index +``llseek`` + called when the VFS needs to move the file position index -``read``: called by read(2) and related system calls +``read`` + called by read(2) and related system calls -``read_iter``: possibly asynchronous read with iov_iter as destination +``read_iter`` + possibly asynchronous read with iov_iter as destination -``write``: called by write(2) and related system calls +``write`` + called by write(2) and related system calls -``write_iter``: possibly asynchronous write with iov_iter as source +``write_iter`` + possibly asynchronous write with iov_iter as source -``iopoll``: called when aio wants to poll for completions on HIPRI iocbs +``iopoll`` + called when aio wants to poll for completions on HIPRI iocbs -``iterate``: called when the VFS needs to read the directory contents +``iterate`` + called when the VFS needs to read the directory contents -``iterate_shared``: called when the VFS needs to read the directory contents - when filesystem supports concurrent dir iterators +``iterate_shared`` + called when the VFS needs to read the directory contents when + filesystem supports concurrent dir iterators -``poll``: called by the VFS when a process wants to check if there is +``poll`` + called by the VFS when a process wants to check if there is activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls -``unlocked_ioctl``: called by the ioctl(2) system call. +``unlocked_ioctl`` + called by the ioctl(2) system call. -``compat_ioctl``: called by the ioctl(2) system call when 32 bit system calls - are used on 64 bit kernels. +``compat_ioctl`` + called by the ioctl(2) system call when 32 bit system calls are + used on 64 bit kernels. -``mmap``: called by the mmap(2) system call +``mmap`` + called by the mmap(2) system call -``open``: called by the VFS when an inode should be opened. When the VFS +``open`` + called by the VFS when an inode should be opened. When the VFS opens a file, it creates a new "struct file". It then calls the open method for the newly allocated file structure. You might - think that the open method really belongs in - "struct inode_operations", and you may be right. I think it's - done the way it is because it makes filesystems simpler to - implement. The open() method is a good place to initialize the + think that the open method really belongs in "struct + inode_operations", and you may be right. I think it's done the + way it is because it makes filesystems simpler to implement. + The open() method is a good place to initialize the "private_data" member in the file structure if you want to point to a device structure -``flush``: called by the close(2) system call to flush a file +``flush`` + called by the close(2) system call to flush a file -``release``: called when the last reference to an open file is closed +``release`` + called when the last reference to an open file is closed -``fsync``: called by the fsync(2) system call. Also see the section above - entitled "Handling errors during writeback". +``fsync`` + called by the fsync(2) system call. Also see the section above + entitled "Handling errors during writeback". -``fasync``: called by the fcntl(2) system call when asynchronous +``fasync`` + called by the fcntl(2) system call when asynchronous (non-blocking) mode is enabled for a file -``lock``: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW - commands +``lock`` + called by the fcntl(2) system call for F_GETLK, F_SETLK, and + F_SETLKW commands -``get_unmapped_area``: called by the mmap(2) system call +``get_unmapped_area`` + called by the mmap(2) system call -``check_flags``: called by the fcntl(2) system call for F_SETFL command +``check_flags`` + called by the fcntl(2) system call for F_SETFL command -``flock``: called by the flock(2) system call +``flock`` + called by the flock(2) system call -``splice_write``: called by the VFS to splice data from a pipe to a file. This - method is used by the splice(2) system call +``splice_write`` + called by the VFS to splice data from a pipe to a file. This + method is used by the splice(2) system call -``splice_read``: called by the VFS to splice data from file to a pipe. This - method is used by the splice(2) system call +``splice_read`` + called by the VFS to splice data from file to a pipe. This + method is used by the splice(2) system call -``setlease``: called by the VFS to set or release a file lock lease. setlease - implementations should call generic_setlease to record or remove - the lease in the inode after setting it. +``setlease`` + called by the VFS to set or release a file lock lease. setlease + implementations should call generic_setlease to record or remove + the lease in the inode after setting it. -``fallocate``: called by the VFS to preallocate blocks or punch a hole. +``fallocate`` + called by the VFS to preallocate blocks or punch a hole. -``copy_file_range``: called by the copy_file_range(2) system call. +``copy_file_range`` + called by the copy_file_range(2) system call. -``remap_file_range``: called by the ioctl(2) system call for FICLONERANGE and - FICLONE and FIDEDUPERANGE commands to remap file ranges. An - implementation should remap len bytes at pos_in of the source file into - the dest file at pos_out. Implementations must handle callers passing - in len == 0; this means "remap to the end of the source file". The - return value should the number of bytes remapped, or the usual - negative error code if errors occurred before any bytes were remapped. - The remap_flags parameter accepts REMAP_FILE_* flags. If - REMAP_FILE_DEDUP is set then the implementation must only remap if the - requested file ranges have identical contents. If REMAP_CAN_SHORTEN is - set, the caller is ok with the implementation shortening the request - length to satisfy alignment or EOF requirements (or any other reason). +``remap_file_range`` + called by the ioctl(2) system call for FICLONERANGE and FICLONE + and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source + file into the dest file at pos_out. Implementations must handle + callers passing in len == 0; this means "remap to the end of the + source file". The return value should the number of bytes + remapped, or the usual negative error code if errors occurred + before any bytes were remapped. The remap_flags parameter + accepts REMAP_FILE_* flags. If REMAP_FILE_DEDUP is set then the + implementation must only remap if the requested file ranges have + identical contents. If REMAP_CAN_SHORTEN is set, the caller is + ok with the implementation shortening the request length to + satisfy alignment or EOF requirements (or any other reason). -``fadvise``: possibly called by the fadvise64() system call. +``fadvise`` + possibly called by the fadvise64() system call. Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node @@ -1041,89 +1151,104 @@ defined: struct dentry *(*d_real)(struct dentry *, const struct inode *); }; -``d_revalidate``: called when the VFS needs to revalidate a dentry. This - is called whenever a name look-up finds a dentry in the - dcache. Most local filesystems leave this as NULL, because all their - dentries in the dcache are valid. Network filesystems are different - since things can change on the server without the client necessarily - being aware of it. - - This function should return a positive value if the dentry is still - valid, and zero or a negative error code if it isn't. - - d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). - If in rcu-walk mode, the filesystem must revalidate the dentry without - blocking or storing to the dentry, d_parent and d_inode should not be - used without care (because they can change and, in d_inode case, even - become NULL under us). - - If a situation is encountered that rcu-walk cannot handle, return +``d_revalidate`` + called when the VFS needs to revalidate a dentry. This is + called whenever a name look-up finds a dentry in the dcache. + Most local filesystems leave this as NULL, because all their + dentries in the dcache are valid. Network filesystems are + different since things can change on the server without the + client necessarily being aware of it. + + This function should return a positive value if the dentry is + still valid, and zero or a negative error code if it isn't. + + d_revalidate may be called in rcu-walk mode (flags & + LOOKUP_RCU). If in rcu-walk mode, the filesystem must + revalidate the dentry without blocking or storing to the dentry, + d_parent and d_inode should not be used without care (because + they can change and, in d_inode case, even become NULL under + us). + + If a situation is encountered that rcu-walk cannot handle, + return -ECHILD and it will be called again in ref-walk mode. -``_weak_revalidate``: called when the VFS needs to revalidate a "jumped" dentry. - This is called when a path-walk ends at dentry that was not acquired by - doing a lookup in the parent directory. This includes "/", "." and "..", - as well as procfs-style symlinks and mountpoint traversal. +``_weak_revalidate`` + called when the VFS needs to revalidate a "jumped" dentry. This + is called when a path-walk ends at dentry that was not acquired + by doing a lookup in the parent directory. This includes "/", + "." and "..", as well as procfs-style symlinks and mountpoint + traversal. - In this case, we are less concerned with whether the dentry is still - fully correct, but rather that the inode is still valid. As with - d_revalidate, most local filesystems will set this to NULL since their - dcache entries are always valid. + In this case, we are less concerned with whether the dentry is + still fully correct, but rather that the inode is still valid. + As with d_revalidate, most local filesystems will set this to + NULL since their dcache entries are always valid. - This function has the same return code semantics as d_revalidate. + This function has the same return code semantics as + d_revalidate. d_weak_revalidate is only called after leaving rcu-walk mode. -``d_hash``: called when the VFS adds a dentry to the hash table. The first +``d_hash`` + called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is to be hashed into. Same locking and synchronisation rules as d_compare regarding what is safe to dereference etc. -``d_compare``: called to compare a dentry name with a given name. The first +``d_compare`` + called to compare a dentry name with a given name. The first dentry is the parent of the dentry to be compared, the second is - the child dentry. len and name string are properties of the dentry - to be compared. qstr is the name to compare it with. + the child dentry. len and name string are properties of the + dentry to be compared. qstr is the name to compare it with. Must be constant and idempotent, and should not take locks if - possible, and should not or store into the dentry. - Should not dereference pointers outside the dentry without - lots of care (eg. d_parent, d_inode, d_name should not be used). - - However, our vfsmount is pinned, and RCU held, so the dentries and - inodes won't disappear, neither will our sb or filesystem module. - ->d_sb may be used. - - It is a tricky calling convention because it needs to be called under - "rcu-walk", ie. without any locks or references on things. - -``d_delete``: called when the last reference to a dentry is dropped and the - dcache is deciding whether or not to cache it. Return 1 to delete - immediately, or 0 to cache the dentry. Default is NULL which means to - always cache a reachable dentry. d_delete must be constant and - idempotent. - -``d_init``: called when a dentry is allocated - -``d_release``: called when a dentry is really deallocated - -``d_iput``: called when a dentry loses its inode (just prior to its - being deallocated). The default when this is NULL is that the - VFS calls iput(). If you define this method, you must call - iput() yourself - -``d_dname``: called when the pathname of a dentry should be generated. - Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay - pathname generation. (Instead of doing it when dentry is created, - it's done only when the path is needed.). Real filesystems probably - dont want to use it, because their dentries are present in global - dcache hash, so their hash should be an invariant. As no lock is - held, d_dname() should not try to modify the dentry itself, unless - appropriate SMP safety is used. CAUTION : d_path() logic is quite - tricky. The correct way to return for example "Hello" is to put it - at the end of the buffer, and returns a pointer to the first char. - dynamic_dname() helper function is provided to take care of this. + possible, and should not or store into the dentry. Should not + dereference pointers outside the dentry without lots of care + (eg. d_parent, d_inode, d_name should not be used). + + However, our vfsmount is pinned, and RCU held, so the dentries + and inodes won't disappear, neither will our sb or filesystem + module. ->d_sb may be used. + + It is a tricky calling convention because it needs to be called + under "rcu-walk", ie. without any locks or references on things. + +``d_delete`` + called when the last reference to a dentry is dropped and the + dcache is deciding whether or not to cache it. Return 1 to + delete immediately, or 0 to cache the dentry. Default is NULL + which means to always cache a reachable dentry. d_delete must + be constant and idempotent. + +``d_init`` + called when a dentry is allocated + +``d_release`` + called when a dentry is really deallocated + +``d_iput`` + called when a dentry loses its inode (just prior to its being + deallocated). The default when this is NULL is that the VFS + calls iput(). If you define this method, you must call iput() + yourself + +``d_dname`` + called when the pathname of a dentry should be generated. + Useful for some pseudo filesystems (sockfs, pipefs, ...) to + delay pathname generation. (Instead of doing it when dentry is + created, it's done only when the path is needed.). Real + filesystems probably dont want to use it, because their dentries + are present in global dcache hash, so their hash should be an + invariant. As no lock is held, d_dname() should not try to + modify the dentry itself, unless appropriate SMP safety is used. + CAUTION : d_path() logic is quite tricky. The correct way to + return for example "Hello" is to put it at the end of the + buffer, and returns a pointer to the first char. + dynamic_dname() helper function is provided to take care of + this. Example : @@ -1135,52 +1260,57 @@ defined: dentry->d_inode->i_ino); } -``d_automount``: called when an automount dentry is to be traversed (optional). - This should create a new VFS mount record and return the record to the - caller. The caller is supplied with a path parameter giving the - automount directory to describe the automount target and the parent - VFS mount record to provide inheritable mount parameters. NULL should - be returned if someone else managed to make the automount first. If - the vfsmount creation failed, then an error code should be returned. - If -EISDIR is returned, then the directory will be treated as an - ordinary directory and returned to pathwalk to continue walking. - - If a vfsmount is returned, the caller will attempt to mount it on the - mountpoint and will remove the vfsmount from its expiration list in - the case of failure. The vfsmount should be returned with 2 refs on - it to prevent automatic expiration - the caller will clean up the - additional ref. - - This function is only used if DCACHE_NEED_AUTOMOUNT is set on the - dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the - inode being added. - -``d_manage``: called to allow the filesystem to manage the transition from a - dentry (optional). This allows autofs, for example, to hold up clients - waiting to explore behind a 'mountpoint' while letting the daemon go - past and construct the subtree there. 0 should be returned to let the - calling process continue. -EISDIR can be returned to tell pathwalk to - use this directory as an ordinary directory and to ignore anything - mounted on it and not to check the automount flag. Any other error - code will abort pathwalk completely. +``d_automount`` + called when an automount dentry is to be traversed (optional). + This should create a new VFS mount record and return the record + to the caller. The caller is supplied with a path parameter + giving the automount directory to describe the automount target + and the parent VFS mount record to provide inheritable mount + parameters. NULL should be returned if someone else managed to + make the automount first. If the vfsmount creation failed, then + an error code should be returned. If -EISDIR is returned, then + the directory will be treated as an ordinary directory and + returned to pathwalk to continue walking. + + If a vfsmount is returned, the caller will attempt to mount it + on the mountpoint and will remove the vfsmount from its + expiration list in the case of failure. The vfsmount should be + returned with 2 refs on it to prevent automatic expiration - the + caller will clean up the additional ref. + + This function is only used if DCACHE_NEED_AUTOMOUNT is set on + the dentry. This is set by __d_instantiate() if S_AUTOMOUNT is + set on the inode being added. + +``d_manage`` + called to allow the filesystem to manage the transition from a + dentry (optional). This allows autofs, for example, to hold up + clients waiting to explore behind a 'mountpoint' while letting + the daemon go past and construct the subtree there. 0 should be + returned to let the calling process continue. -EISDIR can be + returned to tell pathwalk to use this directory as an ordinary + directory and to ignore anything mounted on it and not to check + the automount flag. Any other error code will abort pathwalk + completely. If the 'rcu_walk' parameter is true, then the caller is doing a - pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, - and the caller can be asked to leave it and call again by returning - -ECHILD. -EISDIR may also be returned to tell pathwalk to - ignore d_automount or any mounts. + pathwalk in RCU-walk mode. Sleeping is not permitted in this + mode, and the caller can be asked to leave it and call again by + returning -ECHILD. -EISDIR may also be returned to tell + pathwalk to ignore d_automount or any mounts. - This function is only used if DCACHE_MANAGE_TRANSIT is set on the - dentry being transited from. + This function is only used if DCACHE_MANAGE_TRANSIT is set on + the dentry being transited from. -``d_real``: overlay/union type filesystems implement this method to return one of - the underlying dentries hidden by the overlay. It is used in two - different modes: +``d_real`` + overlay/union type filesystems implement this method to return + one of the underlying dentries hidden by the overlay. It is + used in two different modes: - Called from file_dentry() it returns the real dentry matching the inode - argument. The real dentry may be from a lower layer already copied up, - but still referenced from the file. This mode is selected with a - non-NULL inode argument. + Called from file_dentry() it returns the real dentry matching + the inode argument. The real dentry may be from a lower layer + already copied up, but still referenced from the file. This + mode is selected with a non-NULL inode argument. With NULL inode the topmost real underlying dentry is returned. @@ -1195,40 +1325,47 @@ Directory Entry Cache API There are a number of functions defined which permit a filesystem to manipulate dentries: -``dget``: open a new handle for an existing dentry (this just increments +``dget`` + open a new handle for an existing dentry (this just increments the usage count) -``dput``: close a handle for a dentry (decrements the usage count). If +``dput`` + close a handle for a dentry (decrements the usage count). If the usage count drops to 0, and the dentry is still in its parent's hash, the "d_delete" method is called to check whether - it should be cached. If it should not be cached, or if the dentry - is not hashed, it is deleted. Otherwise cached dentries are put - into an LRU list to be reclaimed on memory shortage. - -``d_drop``: this unhashes a dentry from its parents hash list. A - subsequent call to dput() will deallocate the dentry if its - usage count drops to 0 - -``d_delete``: delete a dentry. If there are no other open references to - the dentry then the dentry is turned into a negative dentry - (the d_iput() method is called). If there are other - references, then d_drop() is called instead - -``d_add``: add a dentry to its parents hash list and then calls + it should be cached. If it should not be cached, or if the + dentry is not hashed, it is deleted. Otherwise cached dentries + are put into an LRU list to be reclaimed on memory shortage. + +``d_drop`` + this unhashes a dentry from its parents hash list. A subsequent + call to dput() will deallocate the dentry if its usage count + drops to 0 + +``d_delete`` + delete a dentry. If there are no other open references to the + dentry then the dentry is turned into a negative dentry (the + d_iput() method is called). If there are other references, then + d_drop() is called instead + +``d_add`` + add a dentry to its parents hash list and then calls d_instantiate() -``d_instantiate``: add a dentry to the alias hash list for the inode and - updates the "d_inode" member. The "i_count" member in the - inode structure should be set/incremented. If the inode - pointer is NULL, the dentry is called a "negative - dentry". This function is commonly called when an inode is - created for an existing negative dentry - -``d_lookup``: look up a dentry given its parent and path name component - It looks up the child of that given name from the dcache - hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use dput() - to free the dentry when it finishes using it. +``d_instantiate`` + add a dentry to the alias hash list for the inode and updates + the "d_inode" member. The "i_count" member in the inode + structure should be set/incremented. If the inode pointer is + NULL, the dentry is called a "negative dentry". This function + is commonly called when an inode is created for an existing + negative dentry + +``d_lookup`` + look up a dentry given its parent and path name component It + looks up the child of that given name from the dcache hash + table. If it is found, the reference count is incremented and + the dentry is returned. The caller must use dput() to free the + dentry when it finishes using it. Mount Options -- cgit From b422124758c19db06c4c30c4abb8f57bf18995b9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Jun 2019 19:39:44 +0300 Subject: docs/core-api: Add string helpers API to the list Some times string helpers are needed, but there is nothing about them in the generated documentation. Fill the gap by adding a reference to string_helpers.c exported functions. Signed-off-by: Andy Shevchenko Acked-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index a53ec2eb8176..65ae2bf1f86d 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -33,6 +33,9 @@ String Conversions .. kernel-doc:: lib/kstrtox.c :export: +.. kernel-doc:: lib/string_helpers.c + :export: + String Manipulation ------------------- -- cgit From 58d494669f36d0b61b7ec42c232877167ed3f5ce Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Jun 2019 19:51:13 +0300 Subject: docs/core-api: Add integer power functions to the list Some times integer power functions, such as int_sqrt(), are needed, but there is nothing about them in the generated documentation. Fill the gap by adding a reference to the corresponding exported functions. Signed-off-by: Andy Shevchenko Acked-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/core-api/kernel-api.rst | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 65ae2bf1f86d..824f24ccf401 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -141,6 +141,15 @@ Base 2 log and power Functions .. kernel-doc:: include/linux/log2.h :internal: +Integer power Functions +----------------------- + +.. kernel-doc:: lib/math/int_pow.c + :export: + +.. kernel-doc:: lib/math/int_sqrt.c + :export: + Division Functions ------------------ -- cgit From 4665743276c3e8efaa4da54eabe8f54b2f99e335 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 5 Jun 2019 07:56:30 -0500 Subject: dt-bindings: mfd: Add lm36274 bindings to ti-lmu Add the LM36274 backlight driver with regulator support. This is a multi-function device for backlight applications. Backlight properties will be documented in it's a supplemental bindings document. Regulator support is documented in the regulator/lm363x-regulator.txt http://www.ti.com/lit/ds/symlink/lm36274.pdf Signed-off-by: Dan Murphy Reviewed-by: Rob Herring Acked-by: Lee Jones Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- Documentation/devicetree/bindings/mfd/ti-lmu.txt | 54 ++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt index 782d3c9812ed..2296b8f24de4 100644 --- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt +++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt @@ -8,6 +8,7 @@ TI LMU driver supports lighting devices below. LM3632 Backlight and regulator LM3633 Backlight, LED and fault monitor LM3695 Backlight + LM36274 Backlight and regulator Required properties: - compatible: Should be one of: @@ -15,11 +16,13 @@ Required properties: "ti,lm3632" "ti,lm3633" "ti,lm3695" + "ti,lm36274" - reg: I2C slave address. 0x11 for LM3632 0x29 for LM3631 0x36 for LM3633 0x63 for LM3695 + 0x11 for LM36274 Optional properties: - enable-gpios: A GPIO specifier for hardware enable pin. @@ -50,12 +53,14 @@ Optional nodes: - compatible: Should be one of: "ti,lm3633-fault-monitor" - leds: LED properties for LM3633. Please refer to [2]. + LED properties for LM36274. Please refer to [4]. - regulators: Regulator properties for LM3631 and LM3632. Please refer to [3]. [1] ../leds/backlight/ti-lmu-backlight.txt [2] ../leds/leds-lm3633.txt [3] ../regulator/lm363x-regulator.txt +[4] ../leds/leds-lm36274.txt lm3631@29 { compatible = "ti,lm3631"; @@ -213,3 +218,52 @@ lm3695@63 { }; }; }; + +lm36274@11 { + compatible = "ti,lm36274"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x11>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; + regulators { + #address-cells = <1>; + #size-cells = <0>; + compatible = "ti,lm363x-regulator"; + + enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>, + <&pioC 1 GPIO_ACTIVE_HIGH>; + + vboost { + regulator-name = "lcd_boost"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <7150000>; + regulator-always-on; + }; + + vpos { + regulator-name = "lcd_vpos"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6500000>; + }; + + vneg { + regulator-name = "lcd_vneg"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6500000>; + }; + }; + + backlight { + #address-cells = <1>; + #size-cells = <0>; + compatible = "ti,lm36274-backlight"; + + led@0 { + reg = <0>; + led-sources = <0 2>; + label = "white:backlight_cluster"; + linux,default-trigger = "backlight"; + }; + }; +}; -- cgit From 2076e5c0451ca943ff8ecc6def7239c84c77e070 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 6 May 2019 16:29:38 -0700 Subject: mm/hmm: update HMM documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the HMM documentation to reflect the latest API and make a few minor wording changes. Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Cc: Andrew Morton Signed-off-by: Ralph Campbell Reviewed-by: Jérôme Glisse Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 141 +++++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 67 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 7cdf7282e022..7b6eeda5a7c0 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of this document). HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., -allowing a device to transparently access program address coherently with +allowing a device to transparently access program addresses coherently with the CPU meaning that any valid pointer on the CPU is also a valid pointer for the device. This is becoming mandatory to simplify the use of advanced heterogeneous computing where GPU, DSP, or FPGA are used to perform various @@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third section gives an overview of the HMM design. The fourth section explains how CPU page-table mirroring works and the purpose of HMM in this context. The fifth section deals with how device memory is represented inside the kernel. -Finally, the last section presents a new migration helper that allows lever- -aging the device DMA engine. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. .. contents:: :local: @@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation: i.e., one in which any application memory region can be used by a device transparently. -Split address space happens because device can only access memory allocated -through device specific API. This implies that all memory objects in a program +Split address space happens because devices can only access memory allocated +through a device specific API. This implies that all memory objects in a program are not equal from the device point of view which complicates large programs that rely on a wide set of libraries. -Concretely this means that code that wants to leverage devices like GPUs needs -to copy object between generically allocated memory (malloc, mmap private, mmap +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap share) and memory allocated through the device driver API (this still ends up with an mmap but of the device file). For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -complex data sets (list, tree, ...) are hard to get right. Duplicating a +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and program gets harder to debug because of the +elements. This is error prone and programs get harder to debug because of the duplicate data set and addresses. Split address space also means that libraries cannot transparently use data @@ -77,12 +77,12 @@ I/O bus, device memory characteristics I/O buses cripple shared address spaces due to a few limitations. Most I/O buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from CPU is even more +coherency is often optional. Access to device memory from a CPU is even more limited. More often than not, it is not cache coherent. If we only consider the PCIE bus, then a device can access main memory (often through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from device on main memory. This is worse +a limited set of atomic operations from the device on main memory. This is worse in the other direction: the CPU can only access a limited range of the device memory and cannot perform atomic operations on it. Thus device memory cannot be considered the same as regular memory from the kernel point of view. @@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an order of magnitude higher latency than when the device accesses its own memory. Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- -way cache coherency between CPU and device and allow all atomic operations the +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the architecture supports. Sadly, not all platforms are following this trend and some major architectures are left without hardware solutions to these problems. So for shared address space to make sense, not only must we allow devices to access any memory but we must also permit any memory to be migrated to device -memory while device is using it (blocking CPU access while it happens). +memory while the device is using it (blocking CPU access while it happens). Shared address space and migration ================================== -HMM intends to provide two main features. First one is to share the address +HMM intends to provide two main features. The first one is to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space. @@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the hardware specific details to the device driver. The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of the device memory. Those pages +allows allocating a struct page for each page of device memory. Those pages are special because the CPU cannot map them. However, they allow migrating main memory to device memory using existing migration mechanisms and everything -looks like a page is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm mech- -anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate things is left to the device driver. +of what and when to migrate is left to the device driver. Note that any CPU access to a device page triggers a page fault and a migration back to main memory. For example, when a page backing a given CPU address A is @@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to address A triggers a page fault and initiates a migration back to main memory. With these two features, HMM not only allows a device to mirror process address -space and keeping both CPU and device page table synchronized, but also lever- -ages device memory by migrating the part of the data set that is actively being +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being used by the device. @@ -151,21 +151,28 @@ registration of an hmm_mirror struct:: int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); - int hmm_mirror_register_locked(struct hmm_mirror *mirror, - struct mm_struct *mm); - -The locked variant is to be used when the driver is already holding mmap_sem -of the mm in write mode. The mirror struct has a set of callbacks that are used +The mirror struct has a set of callbacks that are used to propagate CPU page tables:: struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. + */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update_type: type of update that occurred to the CPU page table - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if update.blockable false and callback need to + * block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -176,14 +183,12 @@ to propagate CPU page tables:: * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - void (*update)(struct hmm_mirror *mirror, - enum hmm_update action, - unsigned long start, - unsigned long end); + int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, + const struct hmm_update *update); }; The device driver must perform the update action to the range (mark range -read only, or fully unmap, ...). The device must be done with the update before +read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can @@ -194,17 +199,18 @@ use either:: The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entry if the -write parameter is true. Page faults use the generic mm page fault code path -just like a CPU page fault. +The second one does trigger a page fault on missing or read-only entries if +write access is requested (see below). Page faults use the generic mm page +fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each entry in that array corresponds to an address in the virtual range. HMM provides a set of flags to help the driver identify special CPU page table entries. -Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronized. The usage pattern is:: +Locking within the sync_cpu_device_pagetables() callback is the most important +aspect the driver must respect in order to keep things properly synchronized. +The usage pattern is:: int driver_populate_range(...) { @@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is:: hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; } - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); return ret; } take_lock(driver->update); - if (!range.valid) { + if (!hmm_range_valid(&range)) { release_lock(driver->update); up_read(&mm->mmap_sem); goto again; @@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is:: // Use pfns array content to update device page table - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); release_lock(driver->update); up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before checking the range.valid -field to avoid any race with a concurrent CPU page table update. +sync_cpu_device_pagetables() callback. That lock must be held before calling +hmm_range_valid() to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing @@ -279,46 +285,47 @@ concurrently). Leverage default_flags and pfn_flags_mask ========================================= -The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows -to set fault or snapshot policy for a whole range instead of having to set them -for each entries in the range. +The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify +fault or snapshot policy for the whole range instead of having to set them +for each entry in the pfns array. + +For instance, if the device flags for range.flags are:: -For instance if the device flags for device entries are: - VALID (1 << 63) - WRITE (1 << 62) + range.flags[HMM_PFN_VALID] = (1 << 63); + range.flags[HMM_PFN_WRITE] = (1 << 62); -Now let say that device driver wants to fault with at least read a range then -it does set:: +and the device driver wants pages for a range with at least read permission, +it sets:: range->default_flags = (1 << 63); range->pfn_flags_mask = 0; -and calls hmm_range_fault() as described above. This will fill fault all page +and calls hmm_range_fault() as described above. This will fill fault all pages in the range with at least read permission. -Now let say driver wants to do the same except for one page in the range for -which its want to have write. Now driver set:: +Now let's say the driver wants to do the same except for one page in the range for +which it wants to have write permission. Now driver set:: range->default_flags = (1 << 63); range->pfn_flags_mask = (1 << 62); range->pfns[index_of_write] = (1 << 62); -With this HMM will fault in all page with at least read (ie valid) and for the +With this, HMM will fault in all pages with at least read (i.e., valid) and for the address == range->start + (index_of_write << PAGE_SHIFT) it will fault with -write permission ie if the CPU pte does not have write permission set then HMM +write permission i.e., if the CPU pte does not have write permission set then HMM will call handle_mm_fault(). -Note that HMM will populate the pfns array with write permission for any entry -that have write permission within the CPU pte no matter what are the values set +Note that HMM will populate the pfns array with write permission for any page +that is mapped with CPU write permission no matter what values are set in default_flags or pfn_flags_mask. Represent and manage device memory from core kernel point of view ================================================================= -Several different designs were tried to support device memory. First one used -a device specific data structure to keep information about migrated memory and -HMM hooked itself in various places of mm code to handle any access to +Several different designs were tried to support device memory. The first one +used a device specific data structure to keep information about migrated memory +and HMM hooked itself in various places of mm code to handle any access to addresses that were backed by device memory. It turns out that this ended up replicating most of the fields of struct page and also needed many kernel code paths to be updated to understand this new kind of memory. @@ -341,7 +348,7 @@ The hmm_devmem_ops is where most of the important things are:: struct hmm_devmem_ops { void (*free)(struct hmm_devmem *devmem, struct page *page); - int (*fault)(struct hmm_devmem *devmem, + vm_fault_t (*fault)(struct hmm_devmem *devmem, struct vm_area_struct *vma, unsigned long addr, struct page *page, @@ -417,9 +424,9 @@ willing to pay to keep all the code simpler. Memory cgroup (memcg) and rss accounting ======================================== -For now device memory is accounted as any regular page in rss counters (either +For now, device memory is accounted as any regular page in rss counters (either anonymous if device page is used for anonymous, file if device page is used for -file backed page or shmem if device page is used for shared memory). This is a +file backed page, or shmem if device page is used for shared memory). This is a deliberate choice to keep existing applications, that might start using device memory without knowing about it, running unimpacted. @@ -439,6 +446,6 @@ get more experience in how device memory is used and its impact on memory resource control. -Note that device memory can never be pinned by device driver nor through GUP +Note that device memory can never be pinned by a device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference is dropped in case of shared memory or file backed memory. -- cgit From b637e0856a6248230e53b5465ab0751f27fdf320 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Wed, 5 Jun 2019 10:05:50 -0500 Subject: dt-bindings: socfpga-dwmac: add "altr, socfpga-stmmac-a10-s10" binding Add the "altr,socfpga-stmmac-a10-s10" binding for Arria10/Agilex/Stratix10 implementation of the stmmac ethernet controller. On the Arria10, Agilex, and Stratix10 SoCs, there are a few differences from the Cyclone5 and Arria5: - The emac PHY setup bits are in separate registers. - The PTP reference clock select mask is different. - The register to enable the emac signal from FPGA is different. Because of these differences, the dwmac-socfpga glue logic driver will use this new binding to set the appropriate bits for PHY, PTP reference clock, and signal from FPGA. Signed-off-by: Dinh Nguyen Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/socfpga-dwmac.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt index 17d6819669c8..612a8e8abc88 100644 --- a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt +++ b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt @@ -6,11 +6,17 @@ present in Documentation/devicetree/bindings/net/stmmac.txt. The device node has additional properties: Required properties: - - compatible : Should contain "altr,socfpga-stmmac" along with - "snps,dwmac" and any applicable more detailed + - compatible : For Cyclone5/Arria5 SoCs it should contain + "altr,socfpga-stmmac". For Arria10/Agilex/Stratix10 SoCs + "altr,socfpga-stmmac-a10-s10". + Along with "snps,dwmac" and any applicable more detailed designware version numbers documented in stmmac.txt - altr,sysmgr-syscon : Should be the phandle to the system manager node that encompasses the glue register, the register offset, and the register shift. + On Cyclone5/Arria5, the register shift represents the PHY mode bits, while + on the Arria10/Stratix10/Agilex platforms, the register shift represents + bit for each emac to enable/disable signals from the FPGA fabric to the + EMAC modules. - altr,f2h_ptp_ref_clk use f2h_ptp_ref_clk instead of default eosc1 clock for ptp ref clk. This affects all emacs as the clock is common. -- cgit From 8447d84e35f2c3b2e76f80319d0b547a9521bafa Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 15 Jan 2019 12:19:53 +0000 Subject: dt-bindings: virtio-mmio: Add IOMMU description The nature of a virtio-mmio node is discovered by the virtio driver at probe time. However the DMA relation between devices must be described statically. When a virtio-mmio node is a virtio-iommu device, it needs an "#iommu-cells" property as specified by bindings/iommu/iommu.txt. Otherwise, the virtio-mmio device may perform DMA through an IOMMU, which requires an "iommus" property. Describe these requirements in the device-tree bindings documentation. Reviewed-by: Rob Herring Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. Tsirkin --- Documentation/devicetree/bindings/virtio/mmio.txt | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/virtio/mmio.txt b/Documentation/devicetree/bindings/virtio/mmio.txt index 5069c1b8e193..21af30fbb81f 100644 --- a/Documentation/devicetree/bindings/virtio/mmio.txt +++ b/Documentation/devicetree/bindings/virtio/mmio.txt @@ -8,10 +8,40 @@ Required properties: - reg: control registers base address and size including configuration space - interrupts: interrupt generated by the device +Required properties for virtio-iommu: + +- #iommu-cells: When the node corresponds to a virtio-iommu device, it is + linked to DMA masters using the "iommus" or "iommu-map" + properties [1][2]. #iommu-cells specifies the size of the + "iommus" property. For virtio-iommu #iommu-cells must be + 1, each cell describing a single endpoint ID. + +Optional properties: + +- iommus: If the device accesses memory through an IOMMU, it should + have an "iommus" property [1]. Since virtio-iommu itself + does not access memory through an IOMMU, the "virtio,mmio" + node cannot have both an "#iommu-cells" and an "iommus" + property. + Example: virtio_block@3000 { compatible = "virtio,mmio"; reg = <0x3000 0x100>; interrupts = <41>; + + /* Device has endpoint ID 23 */ + iommus = <&viommu 23> } + + viommu: iommu@3100 { + compatible = "virtio,mmio"; + reg = <0x3100 0x100>; + interrupts = <42>; + + #iommu-cells = <1> + } + +[1] Documentation/devicetree/bindings/iommu/iommu.txt +[2] Documentation/devicetree/bindings/pci/pci-iommu.txt -- cgit From 6c9e92ef8bdde13398caae1989b0292524d6475e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 15 Jan 2019 12:19:54 +0000 Subject: dt-bindings: virtio: Add virtio-pci-iommu node Some systems implement virtio-iommu as a PCI endpoint. The operating system needs to discover the relationship between IOMMU and masters long before the PCI endpoint gets probed. Add a PCI child node to describe the virtio-iommu device. The virtio-pci-iommu is conceptually split between a PCI programming interface and a translation component on the parent bus. The latter doesn't have a node in the device tree. The virtio-pci-iommu node describes both, by linking the PCI endpoint to "iommus" property of DMA master nodes and to "iommu-map" properties of bus nodes. Reviewed-by: Rob Herring Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. Tsirkin --- Documentation/devicetree/bindings/virtio/iommu.txt | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Documentation/devicetree/bindings/virtio/iommu.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/virtio/iommu.txt b/Documentation/devicetree/bindings/virtio/iommu.txt new file mode 100644 index 000000000000..2407fea0651c --- /dev/null +++ b/Documentation/devicetree/bindings/virtio/iommu.txt @@ -0,0 +1,66 @@ +* virtio IOMMU PCI device + +When virtio-iommu uses the PCI transport, its programming interface is +discovered dynamically by the PCI probing infrastructure. However the +device tree statically describes the relation between IOMMU and DMA +masters. Therefore, the PCI root complex that hosts the virtio-iommu +contains a child node representing the IOMMU device explicitly. + +Required properties: + +- compatible: Should be "virtio,pci-iommu" +- reg: PCI address of the IOMMU. As defined in the PCI Bus + Binding reference [1], the reg property is a five-cell + address encoded as (phys.hi phys.mid phys.lo size.hi + size.lo). phys.hi should contain the device's BDF as + 0b00000000 bbbbbbbb dddddfff 00000000. The other cells + should be zero. +- #iommu-cells: Each platform DMA master managed by the IOMMU is assigned + an endpoint ID, described by the "iommus" property [2]. + For virtio-iommu, #iommu-cells must be 1. + +Notes: + +- DMA from the IOMMU device isn't managed by another IOMMU. Therefore the + virtio-iommu node doesn't have an "iommus" property, and is omitted from + the iommu-map property of the root complex. + +Example: + +pcie@10000000 { + compatible = "pci-host-ecam-generic"; + ... + + /* The IOMMU programming interface uses slot 00:01.0 */ + iommu0: iommu@0008 { + compatible = "virtio,pci-iommu"; + reg = <0x00000800 0 0 0 0>; + #iommu-cells = <1>; + }; + + /* + * The IOMMU manages all functions in this PCI domain except + * itself. Omit BDF 00:01.0. + */ + iommu-map = <0x0 &iommu0 0x0 0x8> + <0x9 &iommu0 0x9 0xfff7>; +}; + +pcie@20000000 { + compatible = "pci-host-ecam-generic"; + ... + /* + * The IOMMU also manages all functions from this domain, + * with endpoint IDs 0x10000 - 0x1ffff + */ + iommu-map = <0x0 &iommu0 0x10000 0x10000>; +}; + +ethernet@fe001000 { + ... + /* The IOMMU manages this platform device with endpoint ID 0x20000 */ + iommus = <&iommu0 0x20000>; +}; + +[1] Documentation/devicetree/bindings/pci/pci.txt +[2] Documentation/devicetree/bindings/iommu/iommu.txt -- cgit From 3d8b6e9c774ff57ad2860f79abd9cdb8f29ecaf8 Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Thu, 2 May 2019 14:18:42 +0200 Subject: dt-bindings: mediatek: audsys: add support for MT8516 Add AUDSYS device tree bindings documentation for MediaTek MT8516 SoC. Signed-off-by: Fabien Parent Reviewed-by: Rob Herring Signed-off-by: Stephen Boyd --- Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt index f3cef1a6d95c..07c9d813465c 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt @@ -10,6 +10,7 @@ Required Properties: - "mediatek,mt7622-audsys", "syscon" - "mediatek,mt7623-audsys", "mediatek,mt2701-audsys", "syscon" - "mediatek,mt8183-audiosys", "syscon" + - "mediatek,mt8516-audsys", "syscon" - #clock-cells: Must be 1 The AUDSYS controller uses the common clk binding from -- cgit From 09a0354cadec267be7f5249c89eb998b3474263a Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 6 Jun 2019 16:28:09 -0600 Subject: net: axienet: Use clock framework to get device clock rate This driver was previously always calculating the MDIO clock divisor (from AXI bus clock to MDIO bus clock) based on the CPU clock frequency, assuming that it is the same as the AXI bus frequency, but that simplistic method only works on the MicroBlaze platform. Add support for specifying the clock used for the device in the device tree using the clock framework. If the clock is specified then it will be used when calculating the clock divisor. The previous CPU clock detection method is left for backward compatibility if no clock is specified. Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/xilinx_axienet.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt index 38f9ec076743..2dea90332ebb 100644 --- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -31,6 +31,11 @@ Optional properties: 1 to enable partial TX checksum offload, 2 to enable full TX checksum offload - xlnx,rxcsum : Same values as xlnx,txcsum but for RX checksum offload +- clocks : AXI bus clock for the device. Refer to common clock bindings. + Used to calculate MDIO clock divisor. If not specified, it is + auto-detected from the CPU clock (but only on platforms where + this is possible). New device trees should specify this - the + auto detection is only for backward compatibility. Example: axi_ethernet_eth: ethernet@40c00000 { @@ -38,6 +43,7 @@ Example: device_type = "network"; interrupt-parent = <µblaze_0_axi_intc>; interrupts = <2 0>; + clocks = <&axi_clk>; phy-mode = "mii"; reg = <0x40c00000 0x40000>; xlnx,rxcsum = <0x2>; -- cgit From 522856cefaf09d1a06ddc02535c7e1e81730c278 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 6 Jun 2019 16:28:16 -0600 Subject: net: axienet: Add optional support for Ethernet core interrupt Previously this driver only handled interrupts from the DMA RX and TX blocks, not from the Ethernet core itself. Add optional support for the Ethernet core interrupt, which is used to detect rx_missed and framing errors signalled by the hardware. In order to use this interrupt, a third interrupt needs to be specified in the device tree. Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/xilinx_axienet.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt index 2dea90332ebb..0be335c7abed 100644 --- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -18,7 +18,8 @@ Required properties: - compatible : Must be one of "xlnx,axi-ethernet-1.00.a", "xlnx,axi-ethernet-1.01.a", "xlnx,axi-ethernet-2.01.a" - reg : Address and length of the IO space. -- interrupts : Should be a list of two interrupt, TX and RX. +- interrupts : Should be a list of 2 or 3 interrupts: TX DMA, RX DMA, + and optionally Ethernet core. - phy-handle : Should point to the external phy device. See ethernet.txt file in the same directory. - xlnx,rxmem : Set to allocated memory buffer for Rx/Tx in the hardware -- cgit From a4ebb2997c10f3e8b7b2de27c0f2cf3e4123ed09 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 6 Jun 2019 16:28:20 -0600 Subject: net: axienet: document device tree mdio child node The mdio child node for the MDIO bus is generally required when using this driver but was not documented other than being shown in the example. Document it as an optional (but usually required) parameter. Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/xilinx_axienet.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt index 0be335c7abed..a8be67b425fd 100644 --- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -37,6 +37,9 @@ Optional properties: auto-detected from the CPU clock (but only on platforms where this is possible). New device trees should specify this - the auto detection is only for backward compatibility. + - mdio : Child node for MDIO bus. Must be defined if PHY access is + required through the core's MDIO interface (i.e. always, + unless the PHY is accessed through a different bus). Example: axi_ethernet_eth: ethernet@40c00000 { -- cgit From a1765c1850be90f621fb69751ccaa3370cf60571 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 6 Jun 2019 16:28:22 -0600 Subject: net: axienet: document axistream-connected attribute The axienet driver requires the use of an axistream-connected attribute, but this isn't documented in the devicetree bindings. Document how this attribute is supposed to be used, including the upcoming change to make the usage of this attribute optional. Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- .../devicetree/bindings/net/xilinx_axienet.txt | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt index a8be67b425fd..7360617cdedb 100644 --- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt +++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt @@ -17,9 +17,15 @@ For more details about mdio please refer phy.txt file in the same directory. Required properties: - compatible : Must be one of "xlnx,axi-ethernet-1.00.a", "xlnx,axi-ethernet-1.01.a", "xlnx,axi-ethernet-2.01.a" -- reg : Address and length of the IO space. +- reg : Address and length of the IO space, as well as the address + and length of the AXI DMA controller IO space, unless + axistream-connected is specified, in which case the reg + attribute of the node referenced by it is used. - interrupts : Should be a list of 2 or 3 interrupts: TX DMA, RX DMA, - and optionally Ethernet core. + and optionally Ethernet core. If axistream-connected is + specified, the TX/RX DMA interrupts should be on that node + instead, and only the Ethernet core interrupt is optionally + specified here. - phy-handle : Should point to the external phy device. See ethernet.txt file in the same directory. - xlnx,rxmem : Set to allocated memory buffer for Rx/Tx in the hardware @@ -37,6 +43,11 @@ Optional properties: auto-detected from the CPU clock (but only on platforms where this is possible). New device trees should specify this - the auto detection is only for backward compatibility. +- axistream-connected: Reference to another node which contains the resources + for the AXI DMA controller used by this device. + If this is specified, the DMA-related resources from that + device (DMA registers and DMA TX/RX interrupts) rather + than this one will be used. - mdio : Child node for MDIO bus. Must be defined if PHY access is required through the core's MDIO interface (i.e. always, unless the PHY is accessed through a different bus). @@ -46,10 +57,10 @@ Example: compatible = "xlnx,axi-ethernet-1.00.a"; device_type = "network"; interrupt-parent = <µblaze_0_axi_intc>; - interrupts = <2 0>; + interrupts = <2 0 1>; clocks = <&axi_clk>; phy-mode = "mii"; - reg = <0x40c00000 0x40000>; + reg = <0x40c00000 0x40000 0x50c00000 0x40000>; xlnx,rxcsum = <0x2>; xlnx,rxmem = <0x800>; xlnx,txcsum = <0x2>; -- cgit From 072a551fd5cff2329ad151c7c25b7958afc0f149 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Tue, 28 May 2019 09:47:40 -0700 Subject: dt-bindings: clock: Document gpucc for msm8998 The GPU for msm8998 has its own clock controller. Document it. Signed-off-by: Jeffrey Hugo Signed-off-by: Stephen Boyd --- Documentation/devicetree/bindings/clock/qcom,gpucc.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt index 4e5215ef1acd..269afe8a757e 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt +++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt @@ -2,13 +2,15 @@ Qualcomm Graphics Clock & Reset Controller Binding -------------------------------------------------- Required properties : -- compatible : shall contain "qcom,sdm845-gpucc" +- compatible : shall contain "qcom,sdm845-gpucc" or "qcom,msm8998-gpucc" - reg : shall contain base register location and length - #clock-cells : from common clock binding, shall contain 1 - #reset-cells : from common reset binding, shall contain 1 - #power-domain-cells : from generic power domain binding, shall contain 1 - clocks : shall contain the XO clock + shall contain the gpll0 out main clock (msm8998) - clock-names : shall be "xo" + shall be "gpll0" (msm8998) Example: gpucc: clock-controller@5090000 { -- cgit From df6bd6c002a4dee73d274a87ef1c0e36f21d2583 Mon Sep 17 00:00:00 2001 From: Ludovic Barre Date: Tue, 7 May 2019 11:16:38 +0200 Subject: mtd: spi-nor: stm32: remove the driver as it was replaced by spi-stm32-qspi.c There's a new driver using the SPI memory interface of the SPI framework at spi/spi-stm32-qspi.c, which can be used together with m25p80.c to replace the functionality of this SPI NOR driver. The "new" driver uses the same dt properties and not affects the legacy compatibility. Signed-off-by: Ludovic Barre Signed-off-by: Tudor Ambarus --- .../devicetree/bindings/mtd/stm32-quadspi.txt | 43 ---------------------- 1 file changed, 43 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mtd/stm32-quadspi.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt deleted file mode 100644 index ddd18c135148..000000000000 --- a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt +++ /dev/null @@ -1,43 +0,0 @@ -* STMicroelectronics Quad Serial Peripheral Interface(QuadSPI) - -Required properties: -- compatible: should be "st,stm32f469-qspi" -- reg: the first contains the register location and length. - the second contains the memory mapping address and length -- reg-names: should contain the reg names "qspi" "qspi_mm" -- interrupts: should contain the interrupt for the device -- clocks: the phandle of the clock needed by the QSPI controller -- A pinctrl must be defined to set pins in mode of operation for QSPI transfer - -Optional properties: -- resets: must contain the phandle to the reset controller. - -A spi flash must be a child of the nor_flash node and could have some -properties. Also see jedec,spi-nor.txt. - -Required properties: -- reg: chip-Select number (QSPI controller may connect 2 nor flashes) -- spi-max-frequency: max frequency of spi bus - -Optional property: -- spi-rx-bus-width: see ../spi/spi-bus.txt for the description - -Example: - -qspi: spi@a0001000 { - compatible = "st,stm32f469-qspi"; - reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>; - reg-names = "qspi", "qspi_mm"; - interrupts = <91>; - resets = <&rcc STM32F4_AHB3_RESET(QSPI)>; - clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_qspi0>; - - flash@0 { - reg = <0>; - spi-rx-bus-width = <4>; - spi-max-frequency = <108000000>; - ... - }; -}; -- cgit From fb6dda8349ea0a6a68a743b6cb636261f4489983 Mon Sep 17 00:00:00 2001 From: Long Cheng Date: Sat, 27 Apr 2019 11:36:32 +0800 Subject: dt-bindings: dma: uart: rename binding The filename matches mtk-uart-apdma.c. So using "mtk-uart-apdma.txt" should be better. And add some property. Signed-off-by: Long Cheng Reviewed-by: Rob Herring Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/8250_mtk_dma.txt | 33 ------------- .../devicetree/bindings/dma/mtk-uart-apdma.txt | 54 ++++++++++++++++++++++ 2 files changed, 54 insertions(+), 33 deletions(-) delete mode 100644 Documentation/devicetree/bindings/dma/8250_mtk_dma.txt create mode 100644 Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt b/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt deleted file mode 100644 index 3fe0961bcf64..000000000000 --- a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt +++ /dev/null @@ -1,33 +0,0 @@ -* Mediatek UART APDMA Controller - -Required properties: -- compatible should contain: - * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA - * "mediatek,mt6577-uart-dma" for MT6577 and all of the above - -- reg: The base address of the APDMA register bank. - -- interrupts: A single interrupt specifier. - -- clocks : Must contain an entry for each entry in clock-names. - See ../clocks/clock-bindings.txt for details. -- clock-names: The APDMA clock for register accesses - -Examples: - - apdma: dma-controller@11000380 { - compatible = "mediatek,mt2712-uart-dma"; - reg = <0 0x11000380 0 0x400>; - interrupts = , - , - , - , - , - , - , - ; - clocks = <&pericfg CLK_PERI_AP_DMA>; - clock-names = "apdma"; - #dma-cells = <1>; - }; - diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt new file mode 100644 index 000000000000..5d6f98c43e3d --- /dev/null +++ b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt @@ -0,0 +1,54 @@ +* Mediatek UART APDMA Controller + +Required properties: +- compatible should contain: + * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA + * "mediatek,mt6577-uart-dma" for MT6577 and all of the above + +- reg: The base address of the APDMA register bank. + +- interrupts: A single interrupt specifier. + One interrupt per dma-requests, or 8 if no dma-requests property is present + +- dma-requests: The number of DMA channels + +- clocks : Must contain an entry for each entry in clock-names. + See ../clocks/clock-bindings.txt for details. +- clock-names: The APDMA clock for register accesses + +- mediatek,dma-33bits: Present if the DMA requires support + +Examples: + + apdma: dma-controller@11000400 { + compatible = "mediatek,mt2712-uart-dma"; + reg = <0 0x11000400 0 0x80>, + <0 0x11000480 0 0x80>, + <0 0x11000500 0 0x80>, + <0 0x11000580 0 0x80>, + <0 0x11000600 0 0x80>, + <0 0x11000680 0 0x80>, + <0 0x11000700 0 0x80>, + <0 0x11000780 0 0x80>, + <0 0x11000800 0 0x80>, + <0 0x11000880 0 0x80>, + <0 0x11000900 0 0x80>, + <0 0x11000980 0 0x80>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + ; + dma-requests = <12>; + clocks = <&pericfg CLK_PERI_AP_DMA>; + clock-names = "apdma"; + mediatek,dma-33bits; + #dma-cells = <1>; + }; -- cgit From 99d2b938672944831035bef50c68a6e948e93abf Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 7 Jun 2019 16:47:13 +0900 Subject: Documentation: DMA-API: fix a function name of max_mapping_size The exported function name is dma_max_mapping_size(), not dma_direct_max_mapping_size() so that this patch fixes the function name in the documentation. Fixes: 133d624b1cee ("dma: Introduce dma_max_mapping_size()") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Jonathan Corbet --- Documentation/DMA-API.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 0076150fdccb..e47c63bd4887 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -198,7 +198,7 @@ call to set the mask to the value returned. :: size_t - dma_direct_max_mapping_size(struct device *dev); + dma_max_mapping_size(struct device *dev); Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and -- cgit From 4241d516b0041ae55092fb12739e12184427de5d Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Tue, 4 Jun 2019 15:27:19 -0300 Subject: Documentation/dm-init: fix multi device example The example in the docs regarding multiple device-mappers is invalid (it has a wrong number of arguments), it's a left over from previous versions of the patch. Replace the example with an valid and tested one. Signed-off-by: Helen Koike Reviewed-by: Stephen Boyd Signed-off-by: Jonathan Corbet --- Documentation/device-mapper/dm-init.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt index 8464ee7c01b8..130b3c3679c5 100644 --- a/Documentation/device-mapper/dm-init.txt +++ b/Documentation/device-mapper/dm-init.txt @@ -74,13 +74,13 @@ this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here split on multiple lines for readability: - vroot,,,ro, - 0 1740800 verity 254:0 254:0 1740800 sha1 - 76e9be054b15884a9fa85973e9cb274c93afadb6 - 5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe; - vram,,,rw, - 0 32768 linear 1:0 0, - 32768 32768 linear 1:1 0 + dm-linear,,1,rw, + 0 32768 linear 8:1 0, + 32768 1024000 linear 8:2 0; + dm-verity,,3,ro, + 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 + ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 + 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 Other examples (per target): -- cgit From e0cef9ff6315d48a4dfd39da09ca770e242f9cb5 Mon Sep 17 00:00:00 2001 From: Aurelien Thierry Date: Fri, 7 Jun 2019 10:07:02 +0200 Subject: Documentation: fix typo CLOCK_MONONOTNIC_COARSE Fix typo in documentation file timekeeping.rst: CLOCK_MONONOTNIC_COARSE should be CLOCK_MONOTONIC_COARSE. Signed-off-by: Aurelien Thierry Signed-off-by: Jonathan Corbet --- Documentation/core-api/timekeeping.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst index 93cbeb9daec0..5f87d9c8b04d 100644 --- a/Documentation/core-api/timekeeping.rst +++ b/Documentation/core-api/timekeeping.rst @@ -111,7 +111,7 @@ Some additional variants exist for more specialized cases: void ktime_get_coarse_raw_ts64( struct timespec64 * ) These are quicker than the non-coarse versions, but less accurate, - corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE + corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE in user space, along with the equivalent boottime/tai/raw timebase not available in user space. -- cgit From e47cf0c958775700c74223a1f21a8b3457c57069 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 7 Jun 2019 13:07:29 +0200 Subject: Documentation: tee: Grammar s/the its/its/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Jonathan Corbet --- Documentation/tee.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/tee.txt b/Documentation/tee.txt index 56ea85ffebf2..afacdf2fd1de 100644 --- a/Documentation/tee.txt +++ b/Documentation/tee.txt @@ -32,7 +32,7 @@ User space (the client) connects to the driver by opening /dev/tee[0-9]* or memory. - TEE_IOC_VERSION lets user space know which TEE this driver handles and - the its capabilities. + its capabilities. - TEE_IOC_OPEN_SESSION opens a new session to a Trusted Application. -- cgit From 6fb44c439eda692f94cf60aad55f130a34204ece Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 7 Jun 2019 13:08:42 +0200 Subject: Documentation: net: dsa: Grammar s/the its/its/ Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: Jonathan Corbet --- Documentation/networking/dsa/dsa.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index ca87068b9ab9..563d56c6a25c 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -531,7 +531,7 @@ Bridge VLAN filtering a software implementation. .. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be the its port-based VLAN, used by the associated bridge device. + of DSA, would be its port-based VLAN, used by the associated bridge device. - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a Forwarding Database entry, the switch hardware should be programmed to delete @@ -554,7 +554,7 @@ Bridge VLAN filtering associated with this VLAN ID. .. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be the its port-based VLAN, used by the associated bridge device. + of DSA, would be its port-based VLAN, used by the associated bridge device. - ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a multicast database entry, the switch hardware should be programmed to delete -- cgit From 3f9564e680efb2092dfb826e2f768920c9eb203b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 7 Jun 2019 13:29:51 +0200 Subject: KVM: arm/arm64: Always capitalize ITS All but one reference is capitalized. Fix the remaining one. Signed-off-by: Geert Uytterhoeven Signed-off-by: Jonathan Corbet --- Documentation/virtual/kvm/devices/arm-vgic-its.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt index 4f0c9fc40365..eeaa95b893a8 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt @@ -103,7 +103,7 @@ Groups: The following ordering must be followed when restoring the GIC and the ITS: a) restore all guest memory and create vcpus b) restore all redistributors -c) provide the its base address +c) provide the ITS base address (KVM_DEV_ARM_VGIC_GRP_ADDR) d) restore the ITS in the following order: 1. Restore GITS_CBASER -- cgit From b1663d7e3a7961fc45262fd68a89253f2803036c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 4 Jun 2019 09:26:27 -0300 Subject: docs: Kbuild/Makefile: allow check for missing docs at build time While this doesn't make sense for production Kernels, in order to avoid regressions when documents are touched, let's add a check target at the make file. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/Kconfig | 13 +++++++++++++ Documentation/Makefile | 5 +++++ 2 files changed, 18 insertions(+) create mode 100644 Documentation/Kconfig (limited to 'Documentation') diff --git a/Documentation/Kconfig b/Documentation/Kconfig new file mode 100644 index 000000000000..66046fa1c341 --- /dev/null +++ b/Documentation/Kconfig @@ -0,0 +1,13 @@ +config WARN_MISSING_DOCUMENTS + + bool "Warn if there's a missing documentation file" + depends on COMPILE_TEST + help + It is not uncommon that a document gets renamed. + This option makes the Kernel to check for missing dependencies, + warning when something is missing. Works only if the Kernel + is built from a git tree. + + If unsure, select 'N'. + + diff --git a/Documentation/Makefile b/Documentation/Makefile index 2df0789f90b7..e145e4db508b 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -4,6 +4,11 @@ subdir-y := devicetree/bindings/ +# Check for broken documentation file references +ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y) +$(shell $(srctree)/scripts/documentation-file-ref-check --warn) +endif + # You can set these variables from the command line. SPHINXBUILD = sphinx-build SPHINXOPTS = -- cgit From 889aa9ca930602a0e860cfb89e467c2a7a729b1b Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Fri, 31 May 2019 16:30:16 +0200 Subject: docs: clk: fix struct syntax The clk_foo_ops struct example has syntax errors. Fix it so it can be copy-pasted and used more easily. Signed-off-by: Luca Ceresoli Signed-off-by: Jonathan Corbet --- Documentation/driver-api/clk.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst index 593cca5058b1..3cad45d14187 100644 --- a/Documentation/driver-api/clk.rst +++ b/Documentation/driver-api/clk.rst @@ -175,9 +175,9 @@ the following:: To take advantage of your data you'll need to support valid operations for your clk:: - struct clk_ops clk_foo_ops { - .enable = &clk_foo_enable; - .disable = &clk_foo_disable; + struct clk_ops clk_foo_ops = { + .enable = &clk_foo_enable, + .disable = &clk_foo_disable, }; Implement the above functions using container_of:: -- cgit From 165915c17d681c61962251728d72ecdabe95518e Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Thu, 30 May 2019 22:14:54 +0200 Subject: doc:it_IT: fix file references Fix italian translation file references based on `scripts/documentation-file-ref-check` output. Signed-off-by: Federico Vaga Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- .../translations/it_IT/admin-guide/kernel-parameters.rst | 12 ++++++++++++ Documentation/translations/it_IT/process/adding-syscalls.rst | 2 +- Documentation/translations/it_IT/process/coding-style.rst | 2 +- Documentation/translations/it_IT/process/howto.rst | 2 +- Documentation/translations/it_IT/process/magic-number.rst | 2 +- .../translations/it_IT/process/stable-kernel-rules.rst | 4 ++-- 6 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 Documentation/translations/it_IT/admin-guide/kernel-parameters.rst (limited to 'Documentation') diff --git a/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst b/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst new file mode 100644 index 000000000000..0e36d82a92be --- /dev/null +++ b/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst @@ -0,0 +1,12 @@ +.. include:: ../disclaimer-ita.rst + +:Original: :ref:`Documentation/admin-guide/kernel-parameters.rst ` + +.. _it_kernelparameters: + +I parametri da linea di comando del kernel +========================================== + +.. warning:: + + TODO ancora da tradurre diff --git a/Documentation/translations/it_IT/process/adding-syscalls.rst b/Documentation/translations/it_IT/process/adding-syscalls.rst index e0a64b0688a7..c3a3439595a6 100644 --- a/Documentation/translations/it_IT/process/adding-syscalls.rst +++ b/Documentation/translations/it_IT/process/adding-syscalls.rst @@ -39,7 +39,7 @@ vostra interfaccia. un qualche modo opaca. - Se dovete esporre solo delle informazioni sul sistema, un nuovo nodo in - sysfs (vedere ``Documentation/translations/it_IT/filesystems/sysfs.txt``) o + sysfs (vedere ``Documentation/filesystems/sysfs.txt``) o in procfs potrebbe essere sufficiente. Tuttavia, l'accesso a questi meccanismi richiede che il filesystem sia montato, il che potrebbe non essere sempre vero (per esempio, in ambienti come namespace/sandbox/chroot). diff --git a/Documentation/translations/it_IT/process/coding-style.rst b/Documentation/translations/it_IT/process/coding-style.rst index 5ef534c95e69..a6559d25a23d 100644 --- a/Documentation/translations/it_IT/process/coding-style.rst +++ b/Documentation/translations/it_IT/process/coding-style.rst @@ -696,7 +696,7 @@ nella stringa di titolo:: ... Per la documentazione completa sui file di configurazione, consultate -il documento Documentation/translations/it_IT/kbuild/kconfig-language.txt +il documento Documentation/kbuild/kconfig-language.txt 11) Strutture dati diff --git a/Documentation/translations/it_IT/process/howto.rst b/Documentation/translations/it_IT/process/howto.rst index 9903ac7c566b..44e6077730e8 100644 --- a/Documentation/translations/it_IT/process/howto.rst +++ b/Documentation/translations/it_IT/process/howto.rst @@ -131,7 +131,7 @@ Di seguito una lista di file che sono presenti nei sorgente del kernel e che "Linux kernel patch submission format" http://linux.yyz.us/patch-format.html - :ref:`Documentation/process/translations/it_IT/stable-api-nonsense.rst ` + :ref:`Documentation/translations/it_IT/process/stable-api-nonsense.rst ` Questo file descrive la motivazioni sottostanti la conscia decisione di non avere un API stabile all'interno del kernel, incluso cose come: diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst index 5281d53e57ee..ed1121d0ba84 100644 --- a/Documentation/translations/it_IT/process/magic-number.rst +++ b/Documentation/translations/it_IT/process/magic-number.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-ita.rst -:Original: :ref:`Documentation/process/magic-numbers.rst ` +:Original: :ref:`Documentation/process/magic-number.rst ` :Translator: Federico Vaga .. _it_magicnumbers: diff --git a/Documentation/translations/it_IT/process/stable-kernel-rules.rst b/Documentation/translations/it_IT/process/stable-kernel-rules.rst index 48e88e5ad2c5..4f206cee31a7 100644 --- a/Documentation/translations/it_IT/process/stable-kernel-rules.rst +++ b/Documentation/translations/it_IT/process/stable-kernel-rules.rst @@ -33,7 +33,7 @@ Regole sul tipo di patch che vengono o non vengono accettate nei sorgenti - Non deve includere alcuna correzione "banale" (correzioni grammaticali, pulizia dagli spazi bianchi, eccetera). - Deve rispettare le regole scritte in - :ref:`Documentation/translation/it_IT/process/submitting-patches.rst ` + :ref:`Documentation/translations/it_IT/process/submitting-patches.rst ` - Questa patch o una equivalente deve esistere già nei sorgenti principali di Linux @@ -43,7 +43,7 @@ Procedura per sottomettere patch per i sorgenti -stable - Se la patch contiene modifiche a dei file nelle cartelle net/ o drivers/net, allora seguite le linee guida descritte in - :ref:`Documentation/translation/it_IT/networking/netdev-FAQ.rst `; + :ref:`Documentation/translations/it_IT/networking/netdev-FAQ.rst `; ma solo dopo aver verificato al seguente indirizzo che la patch non sia già in coda: https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= -- cgit From bed0918d64ca28169d55bd138ed20f09e288303e Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Thu, 30 May 2019 22:14:55 +0200 Subject: doc:it_IT: documentation alignment Documentation alignment for the following changes: a700767a7682 (doc/docs-next) docs: requirements.txt: recommend Sphinx 1.7.9 Signed-off-by: Federico Vaga Signed-off-by: Jonathan Corbet --- Documentation/translations/it_IT/doc-guide/sphinx.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/translations/it_IT/doc-guide/sphinx.rst b/Documentation/translations/it_IT/doc-guide/sphinx.rst index 793b5cc33403..1739cba8863e 100644 --- a/Documentation/translations/it_IT/doc-guide/sphinx.rst +++ b/Documentation/translations/it_IT/doc-guide/sphinx.rst @@ -35,8 +35,7 @@ Installazione Sphinx ==================== I marcatori ReST utilizzati nei file in Documentation/ sono pensati per essere -processati da ``Sphinx`` nella versione 1.3 o superiore. Se desiderate produrre -un documento PDF è raccomandato l'utilizzo di una versione superiore alle 1.4.6. +processati da ``Sphinx`` nella versione 1.3 o superiore. Esiste uno script che verifica i requisiti Sphinx. Per ulteriori dettagli consultate :ref:`it_sphinx-pre-install`. @@ -68,13 +67,13 @@ pacchettizzato dalla vostra distribuzione. utilizzando LaTeX. Per una corretta interpretazione, è necessario aver installato texlive con i pacchetti amdfonts e amsmath. -Riassumendo, se volete installare la versione 1.4.9 di Sphinx dovete eseguire:: +Riassumendo, se volete installare la versione 1.7.9 di Sphinx dovete eseguire:: - $ virtualenv sphinx_1.4 - $ . sphinx_1.4/bin/activate - (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_1.7.9 + $ . sphinx_1.7.9/bin/activate + (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt -Dopo aver eseguito ``. sphinx_1.4/bin/activate``, il prompt cambierà per +Dopo aver eseguito ``. sphinx_1.7.9/bin/activate``, il prompt cambierà per indicare che state usando il nuovo ambiente. Se aprite un nuova sessione, prima di generare la documentazione, dovrete rieseguire questo comando per rientrare nell'ambiente virtuale. @@ -120,8 +119,8 @@ l'installazione:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.4 - . sphinx_1.4/bin/activate + /usr/bin/virtualenv sphinx_1.7.9 + . sphinx_1.7.9/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. -- cgit From 3d9cf48b2ca257f1a249b347236098c3cf9d54f1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 9 May 2019 15:40:49 +0800 Subject: Documentation: nvdimm: Fix typo Remove the extra 'we '. Signed-off-by: Shiyang Ruan Signed-off-by: Jonathan Corbet --- Documentation/nvdimm/nvdimm.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt index e894de69915a..1669f626b037 100644 --- a/Documentation/nvdimm/nvdimm.txt +++ b/Documentation/nvdimm/nvdimm.txt @@ -284,8 +284,8 @@ A bus has a 1:1 relationship with an NFIT. The current expectation for ACPI based systems is that there is only ever one platform-global NFIT. That said, it is trivial to register multiple NFITs, the specification does not preclude it. The infrastructure supports multiple busses and -we we use this capability to test multiple NFIT configurations in the -unit test. +we use this capability to test multiple NFIT configurations in the unit +test. LIBNVDIMM: control class device in /sys/class -- cgit From 9d61944356590c40b13f6b1f99df84260e4db0c1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 9 May 2019 11:05:49 +0800 Subject: Documentation: xfs: Fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In "Y+P" of this line, there are two non-ASCII characters(0xd9 0x8d) following behind the 'Y'. Shown as a small '=' under the '+' in VIM and a '賺' in webpage[1]. I think it's a mistake and remove these strange characters. [1]: https://www.kernel.org/doc/Documentation/filesystems/xfs-delayed-logging-design.txt Signed-off-by: Shiyang Ruan Signed-off-by: Jonathan Corbet --- Documentation/filesystems/xfs-delayed-logging-design.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index 2ce36439c09f..9a6dd289b17b 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt @@ -34,7 +34,7 @@ transaction: D A+B+C+D X+n+m+o E E Y (> X+n+m+o) - F E+F Yٍ+p + F E+F Y+p In other words, each time an object is relogged, the new transaction contains the aggregation of all the previous changes currently held only in the log. -- cgit From 462e5a521ab73f7762583add73cbab1662612beb Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Wed, 5 Jun 2019 16:30:10 -0400 Subject: treewide: trivial: fix s/poped/popped/ typo Fix a couple of s/poped/popped/ typos. Signed-off-by: George G. Davis Acked-by: Steven Rostedt (VMware) Acked-by: Masami Hiramatsu Signed-off-by: Jonathan Corbet --- Documentation/arm/mem_alignment | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment index 6335fcacbba9..e110e2781039 100644 --- a/Documentation/arm/mem_alignment +++ b/Documentation/arm/mem_alignment @@ -1,4 +1,4 @@ -Too many problems poped up because of unnoticed misaligned memory access in +Too many problems popped up because of unnoticed misaligned memory access in kernel code lately. Therefore the alignment fixup is now unconditionally configured in for SA11x0 based targets. According to Alan Cox, this is a bad idea to configure it out, but Russell King has some good reasons for -- cgit From 78a89463a31ce463a4b968553f57ff9932a0697f Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Thu, 9 May 2019 18:31:16 +0800 Subject: Documentation: {u,k}probes: add tracing_on before tracing After following the document step by step, the `cat trace` can't be worked without enabling tracing_on and might mislead newbies about the functionality. Signed-off-by: Lecopzer Chen Acked-by: Masami Hiramatsu Signed-off-by: Jonathan Corbet --- Documentation/trace/kprobetrace.rst | 6 ++++++ Documentation/trace/uprobetracer.rst | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 235ce2ab131a..baa3c42ba2f4 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -189,6 +189,12 @@ events, you need to enable it. echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable +Use the following command to start tracing in an interval. +:: + # echo 1 > tracing_on + Open something... + # echo 0 > tracing_on + And you can see the traced information via /sys/kernel/debug/tracing/trace. :: diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index 4346e23e3ae7..0b21305fabdc 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -152,10 +152,15 @@ events, you need to enable it by:: # echo 1 > events/uprobes/enable -Lets disable the event after sleeping for some time. +Lets start tracing, sleep for some time and stop tracing. :: + # echo 1 > tracing_on # sleep 20 + # echo 0 > tracing_on + +Also, you can disable the event by:: + # echo 0 > events/uprobes/enable And you can see the traced information via /sys/kernel/debug/tracing/trace. -- cgit From 04f4dc1bc5f0f64640026497f97e2f3c063f7831 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 5 Jun 2019 07:56:33 -0500 Subject: dt-bindings: leds: Add LED bindings for the LM36274 Add the LM36274 LED specific bindings. Signed-off-by: Dan Murphy Reviewed-by: Rob Herring Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/leds-lm36274.txt | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 Documentation/devicetree/bindings/leds/leds-lm36274.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/leds/leds-lm36274.txt b/Documentation/devicetree/bindings/leds/leds-lm36274.txt new file mode 100644 index 000000000000..456a589c65f0 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-lm36274.txt @@ -0,0 +1,85 @@ +* Texas Instruments LM36274 4-Channel LCD Backlight Driver w/Integrated Bias + +The LM36274 is an integrated four-channel WLED driver and LCD bias supply. +The backlight boost provides the power to bias four parallel LED strings with +up to 29V total output voltage. The 11-bit LED current is programmable via +the I2C bus and/or controlled via a logic level PWM input from 60 uA to 30 mA. + +Parent device properties are documented in +Documentation/devicetree/bindings/mfd/ti_lmu.txt + +Regulator properties are documented in +Documentation/devicetree/bindings/regulator/lm363x-regulator.txt + +Required backlight properties: + - compatible: + "ti,lm36274-backlight" + - reg : 0 + - #address-cells : 1 + - #size-cells : 0 + - led-sources : Indicates which LED strings will be enabled. + Values from 0-3, sources is 0 based so strings will be + source value + 1. + +Optional backlight properties: + - label : see Documentation/devicetree/bindings/leds/common.txt + - linux,default-trigger : + see Documentation/devicetree/bindings/leds/common.txt + +Example: + +HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is +controlled by control bank B. + +lm36274@11 { + compatible = "ti,lm36274"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x11>; + + enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>; + + regulators { + #address-cells = <1>; + #size-cells = <0>; + compatible = "ti,lm363x-regulator"; + + enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>, + <&pioC 1 GPIO_ACTIVE_HIGH>; + + vboost { + regulator-name = "lcd_boost"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <7150000>; + regulator-always-on; + }; + + vpos { + regulator-name = "lcd_vpos"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6500000>; + }; + + vneg { + regulator-name = "lcd_vneg"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6500000>; + }; + }; + + backlight { + #address-cells = <1>; + #size-cells = <0>; + compatible = "ti,lm36274-backlight"; + + led@0 { + reg = <0>; + led-sources = <0 2>; + label = "white:backlight_cluster"; + linux,default-trigger = "backlight"; + }; + }; +}; + +For more product information please see the link below: +http://www.ti.com/lit/ds/symlink/lm36274.pdf -- cgit From 6c0215f5d9f2a1fa5cab2ca320a41d9f19cfa80c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:33 -0300 Subject: ASoC: dt-bindings: fix some broken links from txt->yaml conversion Some new files got converted to yaml, but references weren't updated accordingly. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/media/st,stm32-dcmi.txt | 2 +- Documentation/devicetree/bindings/sound/st,stm32-i2s.txt | 2 +- Documentation/devicetree/bindings/sound/st,stm32-sai.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt index 249790a93017..3122ded82eb4 100644 --- a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt +++ b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt @@ -11,7 +11,7 @@ Required properties: - clock-names: must contain "mclk", which is the DCMI peripherial clock - pinctrl: the pincontrol settings to configure muxing properly for pins that connect to DCMI device. - See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt. + See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml. - dmas: phandle to DMA controller node, see Documentation/devicetree/bindings/dma/stm32-dma.txt - dma-names: must contain "tx", which is the transmit channel from DCMI to DMA diff --git a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt index 58c341300552..cbf24bcd1b8d 100644 --- a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt +++ b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt @@ -18,7 +18,7 @@ Required properties: See Documentation/devicetree/bindings/dma/stm32-dma.txt. - dma-names: Identifier for each DMA request line. Must be "tx" and "rx". - pinctrl-names: should contain only value "default" - - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt + - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml Optional properties: - resets: Reference to a reset controller asserting the reset controller diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt index 3f4467ff0aa2..944743dd9212 100644 --- a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt +++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt @@ -41,7 +41,7 @@ SAI subnodes required properties: "tx": if sai sub-block is configured as playback DAI "rx": if sai sub-block is configured as capture DAI - pinctrl-names: should contain only value "default" - - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt + - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml SAI subnodes Optional properties: - st,sync: specify synchronization mode. -- cgit From 0aa3ebffc43cb8974f2fca92d07b9ebeba0f67c1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 May 2019 20:23:47 -0300 Subject: docs: gpio: driver.rst: fix a bad tag With ReST, [foo]_ means a reference to foo, causing this warning: Documentation/driver-api/gpio/driver.rst:419: WARNING: Unknown target name: "devm". Fix it by using a literal for the name. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Linus Walleij --- Documentation/driver-api/gpio/driver.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 58036c2d84d2..4af9aae724f0 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -418,7 +418,7 @@ symbol: If there is a need to exclude certain GPIO lines from the IRQ domain handled by these helpers, we can set .irq.need_valid_mask of the gpiochip before -[devm_]gpiochip_add_data() is called. This allocates an .irq.valid_mask with as +``[devm_]gpiochip_add_data()`` is called. This allocates an .irq.valid_mask with as many bits set as there are GPIO lines in the chip, each bit representing line 0..n-1. Drivers can exclude GPIO lines by clearing bits from this mask. The mask must be filled in before gpiochip_irqchip_add() or gpiochip_irqchip_add_nested() -- cgit From 0df82dcd55832a99363ab7f9fab954fcacdac3ae Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Tue, 7 May 2019 11:34:37 +0200 Subject: dt-bindings: can: mcp251x: add mcp25625 support Fully compatible with mcp2515, the mcp25625 have integrated transceiver. This patch add the mcp25625 to the device tree bindings documentation. Signed-off-by: Sean Nyekjaer Signed-off-by: Marc Kleine-Budde --- Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt index 188c8bd4eb67..5a0111d4de58 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt +++ b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt @@ -4,6 +4,7 @@ Required properties: - compatible: Should be one of the following: - "microchip,mcp2510" for MCP2510. - "microchip,mcp2515" for MCP2515. + - "microchip,mcp25625" for MCP25625. - reg: SPI chip select. - clocks: The clock feeding the CAN controller. - interrupts: Should contain IRQ line for the CAN controller. -- cgit From fc8938d445d5ca861c5a76098512ab6dd67f6d80 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Thu, 6 Jun 2019 15:26:17 +0530 Subject: dt-bindings: gpio: davinci: Add k3 am654 compatible The patch adds k3 am654 compatible, specific properties and an example. Signed-off-by: Keerthy Signed-off-by: Linus Walleij --- .../devicetree/bindings/gpio/gpio-davinci.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt index 553b92a7e87b..bc6b4b62df83 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt @@ -5,6 +5,7 @@ Required Properties: "ti,keystone-gpio": for Keystone 2 66AK2H/K, 66AK2L, 66AK2E SoCs "ti,k2g-gpio", "ti,keystone-gpio": for 66AK2G + "ti,am654-gpio", "ti,keystone-gpio": for TI K3 AM654 - reg: Physical base address of the controller and the size of memory mapped registers. @@ -145,3 +146,20 @@ gpio0: gpio@260bf00 { ti,ngpio = <32>; ti,davinci-gpio-unbanked = <32>; }; + +Example for K3 AM654: + +wkup_gpio0: wkup_gpio0@42110000 { + compatible = "ti,am654-gpio", "ti,keystone-gpio"; + reg = <0x42110000 0x100>; + gpio-controller; + #gpio-cells = <2>; + interrupt-parent = <&intr_wkup_gpio>; + interrupts = <59 128>, <59 129>, <59 130>, <59 131>; + interrupt-controller; + #interrupt-cells = <2>; + ti,ngpio = <56>; + ti,davinci-gpio-unbanked = <0>; + clocks = <&k3_clks 59 0>; + clock-names = "gpio"; +}; -- cgit From 3a11cf2217520e3c827a24c6d5f648fa45845787 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Thu, 30 May 2019 11:13:57 +0800 Subject: dt-bindings: imx: Correct pinfunc head file path for i.MX8MM The i.MX8MM pinfunc head file is located in DT folder, correct it. Signed-off-by: Anson Huang Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt index 524a16fca666..e4e01c05cf83 100644 --- a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt @@ -12,7 +12,7 @@ Required properties in sub-nodes: - fsl,pins: each entry consists of 6 integers and represents the mux and config setting for one pin. The first 5 integers are specified using a PIN_FUNC_ID macro, which can be found in - . The last integer CONFIG is + . The last integer CONFIG is the pad setting value like pull-up on this pin. Please refer to i.MX8M Mini Reference Manual for detailed CONFIG settings. -- cgit From 7ea6a2edbfd0a98f729f6f4705535bc53ec1a89f Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 3 Jun 2019 13:04:20 +0530 Subject: dt-bindings: pinctrl: Document drive strength settings for BM1880 SoC Document drive strength settings for Bitmain BM1880 SoC. Signed-off-by: Manivannan Sadhasivam Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt index 4eb089bcb5f3..4980776122cc 100644 --- a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt @@ -100,6 +100,17 @@ Optional Properties: Valid values are: <0> - Slow <1> - Fast +- drive-strength: Integer. Selects the drive strength for the specified + pins in mA. + Valid values are: + <4> + <8> + <12> + <16> + <20> + <24> + <28> + <32> Example: pinctrl: pinctrl@400 { -- cgit From 53a5372ce326116f3e3d3f1d701113b2542509f4 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 4 Jun 2019 00:19:59 -0700 Subject: pinctrl: qcom: sdm845: Expose ufs_reset as gpio The ufs_reset pin is expected to be wired to the reset pin of the primary UFS memory but is pretty much just a general purpose output pinr Reorder the pins and expose it as gpio 150, so that the UFS driver can toggle it. Signed-off-by: Bjorn Andersson Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt index 321bdb9be0d2..7462e3743c68 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt @@ -79,7 +79,7 @@ to specify in a pin configuration subnode: gpio0-gpio149 Supports mux, bias and drive-strength - sdc2_clk, sdc2_cmd, sdc2_data + sdc2_clk, sdc2_cmd, sdc2_data, ufs_reset Supports bias and drive-strength - function: -- cgit From 76c4c597b2ef59900933fca8893428f0555afaab Mon Sep 17 00:00:00 2001 From: Hongwei Zhang Date: Tue, 4 Jun 2019 17:53:32 -0400 Subject: pinctrl: aspeed: Add SGPM pinmux Add SGPM pinmux to ast2500-pinctrl function and group, to prepare for supporting SGPIO in AST2500 SoC. Signed-off-by: Hongwei Zhang Reviewed-by: Andrew Jeffery Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt index 3b7266c7c438..8f1c5c4d62f9 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt @@ -84,7 +84,7 @@ NDCD2 NDCD3 NDCD4 NDSR1 NDSR2 NDSR3 NDSR4 NDTR1 NDTR2 NDTR3 NDTR4 NRI1 NRI2 NRI3 NRI4 NRTS1 NRTS2 NRTS3 NRTS4 OSCCLK PEWAKE PNOR PWM0 PWM1 PWM2 PWM3 PWM4 PWM5 PWM6 PWM7 RGMII1 RGMII2 RMII1 RMII2 RXD1 RXD2 RXD3 RXD4 SALT1 SALT10 SALT11 SALT12 SALT13 SALT14 SALT2 SALT3 SALT4 SALT5 SALT6 SALT7 SALT8 SALT9 -SCL1 SCL2 SD1 SD2 SDA1 SDA2 SGPS1 SGPS2 SIOONCTRL SIOPBI SIOPBO SIOPWREQ +SCL1 SCL2 SD1 SD2 SDA1 SDA2 SGPM SGPS1 SGPS2 SIOONCTRL SIOPBI SIOPBO SIOPWREQ SIOPWRGD SIOS3 SIOS5 SIOSCI SPI1 SPI1CS1 SPI1DEBUG SPI1PASSTHRU SPI2CK SPI2CS0 SPI2CS1 SPI2MISO SPI2MOSI TIMER3 TIMER4 TIMER5 TIMER6 TIMER7 TIMER8 TXD1 TXD2 TXD3 TXD4 UART6 USB11BHID USB2AD USB2AH USB2BD USB2BH USBCKI VGABIOSROM VGAHS -- cgit From 26207c7e787248bba8b8cee8bb6db36ecc7c0a13 Mon Sep 17 00:00:00 2001 From: Fabien Lahoudere Date: Thu, 23 May 2019 11:07:37 +0200 Subject: docs: iio: add precision about sampling_frequency_available The documentation give some exemple on what format can be expected from sampling_frequency_available sysfs attribute Signed-off-by: Fabien Lahoudere Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 6aef7dbbde44..680451695422 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -61,8 +61,11 @@ What: /sys/bus/iio/devices/triggerX/sampling_frequency_available KernelVersion: 2.6.35 Contact: linux-iio@vger.kernel.org Description: - When the internal sampling clock can only take a small - discrete set of values, this file lists those available. + When the internal sampling clock can only take a specific set of + frequencies, we can specify the available values with: + - a small discrete set of values like "0 2 4 6 8" + - a range with minimum, step and maximum frequencies like + "[min step max]" What: /sys/bus/iio/devices/iio:deviceX/oversampling_ratio KernelVersion: 2.6.38 -- cgit From d6e561df50b5a7e33273f66f97bf2b4ff25f13b4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 7 Jun 2019 13:06:12 +0200 Subject: dt-bindings: pinctrl: pic32: Spelling s/configuraion/configuration/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt index 29b72e303ebf..51efd2085113 100644 --- a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt @@ -5,7 +5,7 @@ Please refer to pinctrl-bindings.txt, ../gpio/gpio.txt, and pin controller, GPIO, and interrupt bindings. PIC32 'pin configuration node' is a node of a group of pins which can be -used for a specific device or function. This node represents configuraions of +used for a specific device or function. This node represents configurations of pins, optional function, and optional mux related configuration. Required properties for pin controller node: -- cgit From 671c30957e78a822917cf0b04c4592e9813f7f9b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:17 -0300 Subject: ABI: sysfs-devices-system-cpu: point to the right docs The cpuidle doc was split on two, one at the admin guide and another one at the driver API guide. Instead of pointing to a non-existent file, point to both (admin guide being the first one). Signed-off-by: Mauro Carvalho Chehab Acked-by: Rafael J. Wysocki Signed-off-by: Jonathan Corbet --- Documentation/ABI/testing/sysfs-devices-system-cpu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 1528239f69b2..87478ac6c2af 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -137,7 +137,8 @@ Description: Discover cpuidle policy and mechanism current_governor: (RW) displays current idle policy. Users can switch the governor at runtime by writing to this file. - See files in Documentation/cpuidle/ for more information. + See Documentation/admin-guide/pm/cpuidle.rst and + Documentation/driver-api/pm/cpuidle.rst for more information. What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name -- cgit From 065efe27872ca942b53b9f11d5b3f534a9c33857 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:19 -0300 Subject: docs: zh_CN: get rid of basic_profiling.txt Changeset 5700d1974818 ("docs: Get rid of the "basic profiling" guide") removed an old basic-profiling.txt file that was not updated over the last 11 years and won't reflect the post-perf era. It makes no sense to keep its translation, so get rid of it too. Fixes: 5700d1974818 ("docs: Get rid of the "basic profiling" guide") Signed-off-by: Mauro Carvalho Chehab Acked-by: Alex Shi Signed-off-by: Jonathan Corbet --- .../translations/zh_CN/basic_profiling.txt | 71 ---------------------- 1 file changed, 71 deletions(-) delete mode 100644 Documentation/translations/zh_CN/basic_profiling.txt (limited to 'Documentation') diff --git a/Documentation/translations/zh_CN/basic_profiling.txt b/Documentation/translations/zh_CN/basic_profiling.txt deleted file mode 100644 index 1e6bf0bdf8f5..000000000000 --- a/Documentation/translations/zh_CN/basic_profiling.txt +++ /dev/null @@ -1,71 +0,0 @@ -Chinese translated version of Documentation/basic_profiling - -If you have any comment or update to the content, please post to LKML directly. -However, if you have problem communicating in English you can also ask the -Chinese maintainer for help. Contact the Chinese maintainer, if this -translation is outdated or there is problem with translation. - -Chinese maintainer: Liang Xie ---------------------------------------------------------------------- -Documentation/basic_profiling的中文翻译 - -如果想评论或更新本文的内容,请直接发信到LKML。如果你使用英文交流有困难的话,也可 -以向中文版维护者求助。如果本翻译更新不及时或者翻译存在问题,请联系中文版维护者。 - -中文版维护者: 谢良 Liang Xie -中文版翻译者: 谢良 Liang Xie -中文版校译者: -以下为正文 ---------------------------------------------------------------------- - -下面这些说明指令都是非常基础的,如果你想进一步了解请阅读相关专业文档:) -请不要再在本文档增加新的内容,但可以修复文档中的错误:)(mbligh@aracnet.com) -感谢John Levon,Dave Hansen等在撰写时的帮助 - - 用于表示要测量的目标 -请先确保您已经有正确的System.map / vmlinux配置! - -对于linux系统来说,配置vmlinuz最容易的方法可能就是使用“make install”,然后修改 -/sbin/installkernel将vmlinux拷贝到/boot目录,而System.map通常是默认安装好的 - -Readprofile ------------ -2.6系列内核需要版本相对较新的readprofile,比如util-linux 2.12a中包含的,可以从: - -http://www.kernel.org/pub/linux/utils/util-linux/ 下载 - -大部分linux发行版已经包含了. - -启用readprofile需要在kernel启动命令行增加”profile=2“ - -clear readprofile -r - -dump output readprofile -m /boot/System.map > captured_profile - -Oprofile --------- - -从http://oprofile.sourceforge.net/获取源代码(请参考Changes以获取匹配的版本) -在kernel启动命令行增加“idle=poll” - -配置CONFIG_PROFILING=y和CONFIG_OPROFILE=y然后重启进入新kernel - -./configure --with-kernel-support -make install - -想得到好的测量结果,请确保启用了本地APIC特性。如果opreport显示有0Hz CPU, -说明APIC特性没有开启。另外注意idle=poll选项可能有损性能。 - -One time setup: - opcontrol --setup --vmlinux=/boot/vmlinux - -clear opcontrol --reset -start opcontrol --start - -stop opcontrol --stop -dump output opreport > output_file - -如果只看kernel相关的报告结果,请运行命令 opreport -l /boot/vmlinux > output_file - -通过reset选项可以清理过期统计数据,相当于重启的效果。 - -- cgit From 2e03e3a42c961b709926ba5f7c42c09ea6bfb8c1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:20 -0300 Subject: docs: mm: numaperf.rst: get rid of a build warning When building it, it gets this warning: Documentation/admin-guide/mm/numaperf.rst:168: WARNING: Footnote [1] is not referenced. The problem is that this is not really a reference, as it is not mentioned within the documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/numaperf.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index c067ed145158..a80c3c37226e 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -165,5 +165,6 @@ write-through caching. ======== See Also ======== -.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf - Section 5.2.27 + +[1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf +- Section 5.2.27 -- cgit From d857a3ffd3d609d1c822b255d4fe4db8b3464e34 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:21 -0300 Subject: docs: bpf: get rid of two warnings Documentation/bpf/btf.rst:154: WARNING: Unexpected indentation. Documentation/bpf/btf.rst:163: WARNING: Unexpected indentation. Signed-off-by: Mauro Carvalho Chehab Acked-by: Song Liu Signed-off-by: Jonathan Corbet --- Documentation/bpf/btf.rst | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 8820360d00da..4ae022d274ab 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -151,6 +151,7 @@ for the type. The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. For example, a bitfield struct member has: + * btf member bit offset 100 from the start of the structure, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` @@ -160,6 +161,7 @@ from bits ``100 + 2 = 102``. Alternatively, the bitfield struct member can be the following to access the same bits as the above: + * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` -- cgit From 27c054d2939f1a46a4da62732e71c140e664afb9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:22 -0300 Subject: docs: mark orphan documents as such Sphinx doesn't like orphan documents: Documentation/accelerators/ocxl.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/overview.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/stm32f429-overview.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/stm32f746-overview.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/stm32f769-overview.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/stm32h743-overview.rst: WARNING: document isn't included in any toctree Documentation/arm/stm32/stm32mp157-overview.rst: WARNING: document isn't included in any toctree Documentation/gpu/msm-crash-dump.rst: WARNING: document isn't included in any toctree Documentation/interconnect/interconnect.rst: WARNING: document isn't included in any toctree Documentation/laptops/lg-laptop.rst: WARNING: document isn't included in any toctree Documentation/powerpc/isa-versions.rst: WARNING: document isn't included in any toctree Documentation/virtual/kvm/amd-memory-encryption.rst: WARNING: document isn't included in any toctree Documentation/virtual/kvm/vcpu-requests.rst: WARNING: document isn't included in any toctree So, while they aren't on any toctree, add :orphan: to them, in order to silent this warning. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andrew Donnellan Signed-off-by: Jonathan Corbet --- Documentation/accelerators/ocxl.rst | 2 ++ Documentation/arm/stm32/overview.rst | 2 ++ Documentation/arm/stm32/stm32f429-overview.rst | 2 ++ Documentation/arm/stm32/stm32f746-overview.rst | 2 ++ Documentation/arm/stm32/stm32f769-overview.rst | 2 ++ Documentation/arm/stm32/stm32h743-overview.rst | 2 ++ Documentation/arm/stm32/stm32mp157-overview.rst | 2 ++ Documentation/gpu/msm-crash-dump.rst | 2 ++ Documentation/interconnect/interconnect.rst | 2 ++ Documentation/laptops/lg-laptop.rst | 2 ++ Documentation/powerpc/isa-versions.rst | 2 ++ 11 files changed, 22 insertions(+) (limited to 'Documentation') diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst index 14cefc020e2d..b1cea19a90f5 100644 --- a/Documentation/accelerators/ocxl.rst +++ b/Documentation/accelerators/ocxl.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================================== OpenCAPI (Open Coherent Accelerator Processor Interface) ======================================================== diff --git a/Documentation/arm/stm32/overview.rst b/Documentation/arm/stm32/overview.rst index 85cfc8410798..f7e734153860 100644 --- a/Documentation/arm/stm32/overview.rst +++ b/Documentation/arm/stm32/overview.rst @@ -1,3 +1,5 @@ +:orphan: + ======================== STM32 ARM Linux Overview ======================== diff --git a/Documentation/arm/stm32/stm32f429-overview.rst b/Documentation/arm/stm32/stm32f429-overview.rst index 18feda97f483..65bbb1c3b423 100644 --- a/Documentation/arm/stm32/stm32f429-overview.rst +++ b/Documentation/arm/stm32/stm32f429-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F429 Overview ================== diff --git a/Documentation/arm/stm32/stm32f746-overview.rst b/Documentation/arm/stm32/stm32f746-overview.rst index b5f4b6ce7656..42d593085015 100644 --- a/Documentation/arm/stm32/stm32f746-overview.rst +++ b/Documentation/arm/stm32/stm32f746-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F746 Overview ================== diff --git a/Documentation/arm/stm32/stm32f769-overview.rst b/Documentation/arm/stm32/stm32f769-overview.rst index 228656ced2fe..f6adac862b17 100644 --- a/Documentation/arm/stm32/stm32f769-overview.rst +++ b/Documentation/arm/stm32/stm32f769-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F769 Overview ================== diff --git a/Documentation/arm/stm32/stm32h743-overview.rst b/Documentation/arm/stm32/stm32h743-overview.rst index 3458dc00095d..c525835e7473 100644 --- a/Documentation/arm/stm32/stm32h743-overview.rst +++ b/Documentation/arm/stm32/stm32h743-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32H743 Overview ================== diff --git a/Documentation/arm/stm32/stm32mp157-overview.rst b/Documentation/arm/stm32/stm32mp157-overview.rst index 62e176d47ca7..2c52cd020601 100644 --- a/Documentation/arm/stm32/stm32mp157-overview.rst +++ b/Documentation/arm/stm32/stm32mp157-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32MP157 Overview =================== diff --git a/Documentation/gpu/msm-crash-dump.rst b/Documentation/gpu/msm-crash-dump.rst index 757cd257e0d8..240ef200f76c 100644 --- a/Documentation/gpu/msm-crash-dump.rst +++ b/Documentation/gpu/msm-crash-dump.rst @@ -1,3 +1,5 @@ +:orphan: + ===================== MSM Crash Dump Format ===================== diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/interconnect/interconnect.rst index c3e004893796..56e331dab70e 100644 --- a/Documentation/interconnect/interconnect.rst +++ b/Documentation/interconnect/interconnect.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +:orphan: + ===================================== GENERIC SYSTEM INTERCONNECT SUBSYSTEM ===================================== diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst index aa503ee9b3bc..f2c2ffe31101 100644 --- a/Documentation/laptops/lg-laptop.rst +++ b/Documentation/laptops/lg-laptop.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0+ +:orphan: + LG Gram laptop extra features ============================= diff --git a/Documentation/powerpc/isa-versions.rst b/Documentation/powerpc/isa-versions.rst index 812e20cc898c..66c24140ebf1 100644 --- a/Documentation/powerpc/isa-versions.rst +++ b/Documentation/powerpc/isa-versions.rst @@ -1,3 +1,5 @@ +:orphan: + CPU to ISA Version Mapping ========================== -- cgit From f672febc3d132ea0487c63367455124dfa39e30f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:23 -0300 Subject: docs: amd-memory-encryption.rst get rid of warnings Get rid of those warnings: Documentation/virtual/kvm/amd-memory-encryption.rst:244: WARNING: Citation [white-paper] is not referenced. Documentation/virtual/kvm/amd-memory-encryption.rst:246: WARNING: Citation [amd-apm] is not referenced. Documentation/virtual/kvm/amd-memory-encryption.rst:247: WARNING: Citation [kvm-forum] is not referenced. For references that aren't mentioned at the text by adding an explicit reference to them. Signed-off-by: Mauro Carvalho Chehab Acked-by: Paolo Bonzini Signed-off-by: Jonathan Corbet --- Documentation/virtual/kvm/amd-memory-encryption.rst | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst index 659bbc093b52..d18c97b4e140 100644 --- a/Documentation/virtual/kvm/amd-memory-encryption.rst +++ b/Documentation/virtual/kvm/amd-memory-encryption.rst @@ -241,6 +241,9 @@ Returns: 0 on success, -negative on error References ========== + +See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. + .. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf .. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf .. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) -- cgit From d0727cc650f38243c0ac63fd8c91bfd63e3e2578 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:24 -0300 Subject: docs: zh_CN: avoid duplicate citation references Documentation/process/management-style.rst:35: WARNING: duplicate label decisions, other instance in Documentation/translations/zh_CN/process/management-style.rst Documentation/process/programming-language.rst:37: WARNING: duplicate citation c-language, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:38: WARNING: duplicate citation gcc, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:39: WARNING: duplicate citation clang, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:40: WARNING: duplicate citation icc, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:41: WARNING: duplicate citation gcc-c-dialect-options, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:42: WARNING: duplicate citation gnu-extensions, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:43: WARNING: duplicate citation gcc-attribute-syntax, other instance in Documentation/translations/zh_CN/process/programming-language.rst Documentation/process/programming-language.rst:44: WARNING: duplicate citation n2049, other instance in Documentation/translations/zh_CN/process/programming-language.rst Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- .../zh_CN/process/management-style.rst | 4 +- .../zh_CN/process/programming-language.rst | 59 +++++++++++++++++----- 2 files changed, 47 insertions(+), 16 deletions(-) (limited to 'Documentation') diff --git a/Documentation/translations/zh_CN/process/management-style.rst b/Documentation/translations/zh_CN/process/management-style.rst index a181fa56d19e..c6a5bb285797 100644 --- a/Documentation/translations/zh_CN/process/management-style.rst +++ b/Documentation/translations/zh_CN/process/management-style.rst @@ -28,7 +28,7 @@ Linux内核管理风格 不管怎样,这里是: -.. _decisions: +.. _cn_decisions: 1)决策 ------- @@ -108,7 +108,7 @@ Linux内核管理风格 但是,为了做好作为内核管理者的准备,最好记住不要烧掉任何桥梁,不要轰炸任何 无辜的村民,也不要疏远太多的内核开发人员。事实证明,疏远人是相当容易的,而 亲近一个疏远的人是很难的。因此,“疏远”立即属于“不可逆”的范畴,并根据 -:ref:`decisions` 成为绝不可以做的事情。 +:ref:`cn_decisions` 成为绝不可以做的事情。 这里只有几个简单的规则: diff --git a/Documentation/translations/zh_CN/process/programming-language.rst b/Documentation/translations/zh_CN/process/programming-language.rst index 51fd4ef48ea1..2a47a1d2ec20 100644 --- a/Documentation/translations/zh_CN/process/programming-language.rst +++ b/Documentation/translations/zh_CN/process/programming-language.rst @@ -8,21 +8,21 @@ 程序设计语言 ============ -内核是用C语言 [c-language]_ 编写的。更准确地说,内核通常是用 ``gcc`` [gcc]_ -在 ``-std=gnu89`` [gcc-c-dialect-options]_ 下编译的:ISO C90的 GNU 方言( +内核是用C语言 :ref:`c-language ` 编写的。更准确地说,内核通常是用 :ref:`gcc ` +在 ``-std=gnu89`` :ref:`gcc-c-dialect-options ` 下编译的:ISO C90的 GNU 方言( 包括一些C99特性) -这种方言包含对语言 [gnu-extensions]_ 的许多扩展,当然,它们许多都在内核中使用。 +这种方言包含对语言 :ref:`gnu-extensions ` 的许多扩展,当然,它们许多都在内核中使用。 -对于一些体系结构,有一些使用 ``clang`` [clang]_ 和 ``icc`` [icc]_ 编译内核 +对于一些体系结构,有一些使用 :ref:`clang ` 和 :ref:`icc ` 编译内核 的支持,尽管在编写此文档时还没有完成,仍需要第三方补丁。 属性 ---- -在整个内核中使用的一个常见扩展是属性(attributes) [gcc-attribute-syntax]_ +在整个内核中使用的一个常见扩展是属性(attributes) :ref:`gcc-attribute-syntax ` 属性允许将实现定义的语义引入语言实体(如变量、函数或类型),而无需对语言进行 -重大的语法更改(例如添加新关键字) [n2049]_ +重大的语法更改(例如添加新关键字) :ref:`n2049 ` 在某些情况下,属性是可选的(即不支持这些属性的编译器仍然应该生成正确的代码, 即使其速度较慢或执行的编译时检查/诊断次数不够) @@ -31,11 +31,42 @@ ``__attribute__((__pure__))`` ),以检测可以使用哪些关键字和/或缩短代码, 具体 请参阅 ``include/linux/compiler_attributes.h`` -.. [c-language] http://www.open-std.org/jtc1/sc22/wg14/www/standards -.. [gcc] https://gcc.gnu.org -.. [clang] https://clang.llvm.org -.. [icc] https://software.intel.com/en-us/c-compilers -.. [gcc-c-dialect-options] https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html -.. [gnu-extensions] https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html -.. [gcc-attribute-syntax] https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html -.. [n2049] http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf +.. _cn_c-language: + +c-language + http://www.open-std.org/jtc1/sc22/wg14/www/standards + +.. _cn_gcc: + +gcc + https://gcc.gnu.org + +.. _cn_clang: + +clang + https://clang.llvm.org + +.. _cn_icc: + +icc + https://software.intel.com/en-us/c-compilers + +.. _cn_gcc-c-dialect-options: + +c-dialect-options + https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html + +.. _cn_gnu-extensions: + +gnu-extensions + https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html + +.. _cn_gcc-attribute-syntax: + +gcc-attribute-syntax + https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html + +.. _cn_n2049: + +n2049 + http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf -- cgit From ea0ad8763b17395fc611f6d91d1de389ec0cc584 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:25 -0300 Subject: docs: it: license-rules.rst: get rid of warnings There's a wrong identation on a code block, and it tries to use a reference that was not defined at the Italian translation. Documentation/translations/it_IT/process/license-rules.rst:329: WARNING: Literal block expected; none found. Documentation/translations/it_IT/process/license-rules.rst:332: WARNING: Unexpected indentation. Documentation/translations/it_IT/process/license-rules.rst:339: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/translations/it_IT/process/license-rules.rst:341: WARNING: Unexpected indentation. Documentation/translations/it_IT/process/license-rules.rst:305: WARNING: Unknown target name: "metatags". Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Federico Vaga Signed-off-by: Jonathan Corbet --- .../translations/it_IT/process/license-rules.rst | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/translations/it_IT/process/license-rules.rst b/Documentation/translations/it_IT/process/license-rules.rst index f058e06996dc..4cd87a3a7bf9 100644 --- a/Documentation/translations/it_IT/process/license-rules.rst +++ b/Documentation/translations/it_IT/process/license-rules.rst @@ -303,7 +303,7 @@ essere categorizzate in: LICENSES/dual I file in questa cartella contengono il testo completo della rispettiva - licenza e i suoi `Metatags`_. I nomi dei file sono identici agli + licenza e i suoi `Metatag`_. I nomi dei file sono identici agli identificatori di licenza SPDX che dovrebbero essere usati nei file sorgenti. @@ -326,19 +326,19 @@ essere categorizzate in: Esempio del formato del file:: - Valid-License-Identifier: MPL-1.1 - SPDX-URL: https://spdx.org/licenses/MPL-1.1.html - Usage-Guide: - Do NOT use. The MPL-1.1 is not GPL2 compatible. It may only be used for - dual-licensed files where the other license is GPL2 compatible. - If you end up using this it MUST be used together with a GPL2 compatible - license using "OR". - To use the Mozilla Public License version 1.1 put the following SPDX - tag/value pair into a comment according to the placement guidelines in - the licensing rules documentation: - SPDX-License-Identifier: MPL-1.1 - License-Text: - Full license text + Valid-License-Identifier: MPL-1.1 + SPDX-URL: https://spdx.org/licenses/MPL-1.1.html + Usage-Guide: + Do NOT use. The MPL-1.1 is not GPL2 compatible. It may only be used for + dual-licensed files where the other license is GPL2 compatible. + If you end up using this it MUST be used together with a GPL2 compatible + license using "OR". + To use the Mozilla Public License version 1.1 put the following SPDX + tag/value pair into a comment according to the placement guidelines in + the licensing rules documentation: + SPDX-License-Identifier: MPL-1.1 + License-Text: + Full license text | -- cgit From 6ad8b21652ec26a5ad51ffc91470e15c19156548 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:27 -0300 Subject: docs: security: trusted-encrypted.rst: fix code-block tag The code-block tag is at the wrong place, causing those warnings: Documentation/security/keys/trusted-encrypted.rst:112: WARNING: Literal block expected; none found. Documentation/security/keys/trusted-encrypted.rst:121: WARNING: Unexpected indentation. Documentation/security/keys/trusted-encrypted.rst:122: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/security/keys/trusted-encrypted.rst:123: WARNING: Block quote ends without a blank line; unexpected unindent. Signed-off-by: Mauro Carvalho Chehab Acked-by: James Morris Acked-by: Jarkko Sakkinen Signed-off-by: Jonathan Corbet --- Documentation/security/keys/trusted-encrypted.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst index 7b35fcb58933..50ac8bcd6970 100644 --- a/Documentation/security/keys/trusted-encrypted.rst +++ b/Documentation/security/keys/trusted-encrypted.rst @@ -107,12 +107,14 @@ Where:: Examples of trusted and encrypted key usage: -Create and save a trusted key named "kmk" of length 32 bytes:: +Create and save a trusted key named "kmk" of length 32 bytes. Note: When using a TPM 2.0 with a persistent key with handle 0x81000001, append 'keyhandle=0x81000001' to statements between quotes, such as "new 32 keyhandle=0x81000001". +:: + $ keyctl add trusted kmk "new 32" @u 440502848 -- cgit From 43415f13276f09623b1b61376c6f2e43f71bedbb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:28 -0300 Subject: docs: security: core.rst: Fix several warnings Multi-line literal markups only work when they're idented at the same level, with is not the case here: Documentation/security/keys/core.rst:1597: WARNING: Inline literal start-string without end-string. Documentation/security/keys/core.rst:1597: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1597: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1598: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1598: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1600: WARNING: Inline literal start-string without end-string. Documentation/security/keys/core.rst:1600: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1600: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1600: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1600: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1666: WARNING: Inline literal start-string without end-string. Documentation/security/keys/core.rst:1666: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1666: WARNING: Inline emphasis start-string without end-string. Documentation/security/keys/core.rst:1666: WARNING: Inline emphasis start-string without end-string. Fix it by using a code-block instead. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/security/keys/core.rst | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 9521c4207f01..3fd60dcb2dc6 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -1594,10 +1594,12 @@ The structure has a number of fields, some of which are mandatory: attempted key link operation. If there is no match, -EINVAL is returned. - * ``int (*asym_eds_op)(struct kernel_pkey_params *params, - const void *in, void *out);`` - ``int (*asym_verify_signature)(struct kernel_pkey_params *params, - const void *in, const void *in2);`` + * ``asym_eds_op`` and ``asym_verify_signature``:: + + int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); + int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2); These methods are optional. If provided the first allows a key to be used to encrypt, decrypt or sign a blob of data, and the second allows a @@ -1662,8 +1664,10 @@ The structure has a number of fields, some of which are mandatory: required crypto isn't available. - * ``int (*asym_query)(const struct kernel_pkey_params *params, - struct kernel_pkey_query *info);`` + * ``asym_query``:: + + int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); This method is optional. If provided it allows information about the public or asymmetric key held in the key to be determined. -- cgit From c6fff4d3b2f467dd62ee8c69e49c8a8795fe7400 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:30 -0300 Subject: docs: net: sja1105.rst: fix table format There's a table there with produces two warnings when built with Sphinx: Documentation/networking/dsa/sja1105.rst:91: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/networking/dsa/sja1105.rst:91: WARNING: Block quote ends without a blank line; unexpected unindent. It will still produce a table, but the html output is wrong, as it won't interpret the second line as the continuation for the first ones, because identation doesn't match. After the change, the output looks a way better and we got rid of two warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Vladimir Oltean Signed-off-by: Jonathan Corbet --- Documentation/networking/dsa/sja1105.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index ea7bac438cfd..cb2858dece93 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -86,13 +86,13 @@ functionality. The following traffic modes are supported over the switch netdevices: +--------------------+------------+------------------+------------------+ -| | Standalone | Bridged with | Bridged with | -| | ports | vlan_filtering 0 | vlan_filtering 1 | +| | Standalone | Bridged with | Bridged with | +| | ports | vlan_filtering 0 | vlan_filtering 1 | +====================+============+==================+==================+ | Regular traffic | Yes | Yes | No (use master) | +--------------------+------------+------------------+------------------+ | Management traffic | Yes | Yes | Yes | -| (BPDU, PTP) | | | | +| (BPDU, PTP) | | | | +--------------------+------------+------------------+------------------+ Switching features -- cgit From 14b767430a58046bfef8ff9b9f12854e20343092 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:29 -0300 Subject: docs: net: dpio-driver.rst: fix two codeblock warnings Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst:43: WARNING: Definition list ends without a blank line; unexpected unindent. Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst:63: WARNING: Unexpected indentation. looking for now-outdated files... none found Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- .../networking/device_drivers/freescale/dpaa2/dpio-driver.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst index 5045df990a4c..17dbee1ac53e 100644 --- a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst +++ b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst @@ -39,8 +39,7 @@ The Linux DPIO driver consists of 3 primary components-- DPIO service-- provides APIs to other Linux drivers for services - QBman portal interface-- sends portal commands, gets responses -:: + QBman portal interface-- sends portal commands, gets responses:: fsl-mc other bus drivers @@ -60,6 +59,7 @@ The Linux DPIO driver consists of 3 primary components-- The diagram below shows how the DPIO driver components fit with the other DPAA2 Linux driver components:: + +------------+ | OS Network | | Stack | -- cgit From 1eecbcdca2bd8d96881cace19ad105dc0f0263f5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:31 -0300 Subject: docs: move protection-keys.rst to the core-api book This document is used by multiple architectures: $ echo $(git grep -l pkey_mprotect arch|cut -d'/' -f 2|sort|uniq) alpha arm arm64 ia64 m68k microblaze mips parisc powerpc s390 sh sparc x86 xtensa So, let's move it to the core book and adjust the links to it accordingly. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 + Documentation/core-api/protection-keys.rst | 99 ++++++++++++++++++++++++++++++ Documentation/x86/index.rst | 1 - Documentation/x86/protection-keys.rst | 99 ------------------------------ 4 files changed, 100 insertions(+), 100 deletions(-) create mode 100644 Documentation/core-api/protection-keys.rst delete mode 100644 Documentation/x86/protection-keys.rst (limited to 'Documentation') diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index ee1bb8983a88..2466a4c51031 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -34,6 +34,7 @@ Core utilities timekeeping boot-time-mm memory-hotplug + protection-keys Interfaces for kernel debugging diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst new file mode 100644 index 000000000000..49d9833af871 --- /dev/null +++ b/Documentation/core-api/protection-keys.rst @@ -0,0 +1,99 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Memory Protection Keys +====================== + +Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature +which is found on Intel's Skylake "Scalable Processor" Server CPUs. +It will be avalable in future non-server parts. + +For anyone wishing to test or use this feature, it is available in +Amazon's EC2 C5 instances and is known to work there using an Ubuntu +17.04 image. + +Memory Protection Keys provides a mechanism for enforcing page-based +protections, but without requiring modification of the page tables +when an application changes protection domains. It works by +dedicating 4 previously ignored bits in each page table entry to a +"protection key", giving 16 possible keys. + +There is also a new user-accessible register (PKRU) with two separate +bits (Access Disable and Write Disable) for each key. Being a CPU +register, PKRU is inherently thread-local, potentially giving each +thread a different set of protections from every other thread. + +There are two new instructions (RDPKRU/WRPKRU) for reading and writing +to the new register. The feature is only available in 64-bit mode, +even though there is theoretically space in the PAE PTEs. These +permissions are enforced on data access only and have no effect on +instruction fetches. + +Syscalls +======== + +There are 3 system calls which directly interact with pkeys:: + + int pkey_alloc(unsigned long flags, unsigned long init_access_rights) + int pkey_free(int pkey); + int pkey_mprotect(unsigned long start, size_t len, + unsigned long prot, int pkey); + +Before a pkey can be used, it must first be allocated with +pkey_alloc(). An application calls the WRPKRU instruction +directly in order to change access permissions to memory covered +with a key. In this example WRPKRU is wrapped by a C function +called pkey_set(). +:: + + int real_prot = PROT_READ|PROT_WRITE; + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); + ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); + ... application runs here + +Now, if the application needs to update the data at 'ptr', it can +gain access, do the update, then remove its write access:: + + pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE + *ptr = foo; // assign something + pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again + +Now when it frees the memory, it will also free the pkey since it +is no longer in use:: + + munmap(ptr, PAGE_SIZE); + pkey_free(pkey); + +.. note:: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions. + An example implementation can be found in + tools/testing/selftests/x86/protection_keys.c. + +Behavior +======== + +The kernel attempts to make protection keys consistent with the +behavior of a plain mprotect(). For instance if you do this:: + + mprotect(ptr, size, PROT_NONE); + something(ptr); + +you can expect the same effects with protection keys when doing this:: + + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ); + pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey); + something(ptr); + +That should be true whether something() is a direct access to 'ptr' +like:: + + *ptr = foo; + +or when the kernel does the access on the application's behalf like +with a read():: + + read(fd, ptr, 1); + +The kernel will send a SIGSEGV in both cases, but si_code will be set +to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when +the plain mprotect() permissions are violated. diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index ae36fc5fc649..f2de1b2d3ac7 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -19,7 +19,6 @@ x86-specific Documentation tlb mtrr pat - protection-keys intel_mpx amd-memory-encryption pti diff --git a/Documentation/x86/protection-keys.rst b/Documentation/x86/protection-keys.rst deleted file mode 100644 index 49d9833af871..000000000000 --- a/Documentation/x86/protection-keys.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -Memory Protection Keys -====================== - -Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature -which is found on Intel's Skylake "Scalable Processor" Server CPUs. -It will be avalable in future non-server parts. - -For anyone wishing to test or use this feature, it is available in -Amazon's EC2 C5 instances and is known to work there using an Ubuntu -17.04 image. - -Memory Protection Keys provides a mechanism for enforcing page-based -protections, but without requiring modification of the page tables -when an application changes protection domains. It works by -dedicating 4 previously ignored bits in each page table entry to a -"protection key", giving 16 possible keys. - -There is also a new user-accessible register (PKRU) with two separate -bits (Access Disable and Write Disable) for each key. Being a CPU -register, PKRU is inherently thread-local, potentially giving each -thread a different set of protections from every other thread. - -There are two new instructions (RDPKRU/WRPKRU) for reading and writing -to the new register. The feature is only available in 64-bit mode, -even though there is theoretically space in the PAE PTEs. These -permissions are enforced on data access only and have no effect on -instruction fetches. - -Syscalls -======== - -There are 3 system calls which directly interact with pkeys:: - - int pkey_alloc(unsigned long flags, unsigned long init_access_rights) - int pkey_free(int pkey); - int pkey_mprotect(unsigned long start, size_t len, - unsigned long prot, int pkey); - -Before a pkey can be used, it must first be allocated with -pkey_alloc(). An application calls the WRPKRU instruction -directly in order to change access permissions to memory covered -with a key. In this example WRPKRU is wrapped by a C function -called pkey_set(). -:: - - int real_prot = PROT_READ|PROT_WRITE; - pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); - ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); - ... application runs here - -Now, if the application needs to update the data at 'ptr', it can -gain access, do the update, then remove its write access:: - - pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE - *ptr = foo; // assign something - pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again - -Now when it frees the memory, it will also free the pkey since it -is no longer in use:: - - munmap(ptr, PAGE_SIZE); - pkey_free(pkey); - -.. note:: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions. - An example implementation can be found in - tools/testing/selftests/x86/protection_keys.c. - -Behavior -======== - -The kernel attempts to make protection keys consistent with the -behavior of a plain mprotect(). For instance if you do this:: - - mprotect(ptr, size, PROT_NONE); - something(ptr); - -you can expect the same effects with protection keys when doing this:: - - pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ); - pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey); - something(ptr); - -That should be true whether something() is a direct access to 'ptr' -like:: - - *ptr = foo; - -or when the kernel does the access on the application's behalf like -with a read():: - - read(fd, ptr, 1); - -The kernel will send a SIGSEGV in both cases, but si_code will be set -to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when -the plain mprotect() permissions are violated. -- cgit From cb1aaebea8d79860181559d7b5d482aea63db113 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:32 -0300 Subject: docs: fix broken documentation links Mostly due to x86 and acpi conversion, several documentation links are still pointing to the old file. Fix them. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Wolfram Sang Reviewed-by: Sven Van Asbroeck Reviewed-by: Bhupesh Sharma Acked-by: Mark Brown Signed-off-by: Jonathan Corbet --- Documentation/acpi/dsd/leds.txt | 2 +- Documentation/admin-guide/kernel-parameters.rst | 6 +++--- Documentation/admin-guide/kernel-parameters.txt | 16 ++++++++-------- Documentation/admin-guide/ras.rst | 2 +- Documentation/devicetree/bindings/net/fsl-enetc.txt | 7 +++---- .../devicetree/bindings/pci/amlogic,meson-pcie.txt | 2 +- .../bindings/regulator/qcom,rpmh-regulator.txt | 2 +- Documentation/devicetree/booting-without-of.txt | 2 +- Documentation/driver-api/gpio/board.rst | 2 +- Documentation/driver-api/gpio/consumer.rst | 2 +- Documentation/firmware-guide/acpi/enumeration.rst | 2 +- Documentation/firmware-guide/acpi/method-tracing.rst | 2 +- Documentation/i2c/instantiating-devices | 2 +- Documentation/sysctl/kernel.txt | 4 ++-- Documentation/translations/zh_CN/process/4.Coding.rst | 2 +- Documentation/x86/x86_64/5level-paging.rst | 2 +- Documentation/x86/x86_64/boot-options.rst | 4 ++-- Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 2 +- 18 files changed, 31 insertions(+), 32 deletions(-) (limited to 'Documentation') diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt index 81a63af42ed2..cc58b1a574c5 100644 --- a/Documentation/acpi/dsd/leds.txt +++ b/Documentation/acpi/dsd/leds.txt @@ -96,4 +96,4 @@ where , referenced 2019-02-21. -[7] Documentation/acpi/dsd/data-node-reference.txt +[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 0124980dca2d..8d3273e32eb1 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -167,7 +167,7 @@ parameter is applicable:: X86-32 X86-32, aka i386 architecture is enabled. X86-64 X86-64 architecture is enabled. More X86-64 boot options can be found in - Documentation/x86/x86_64/boot-options.txt . + Documentation/x86/x86_64/boot-options.rst. X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) X86_UV SGI UV support is enabled. XEN Xen support is enabled @@ -181,10 +181,10 @@ In addition, the following text indicates that the option:: Parameters denoted with BOOT are actually interpreted by the boot loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme -need or coordination with . +need or coordination with . There are also arch-specific kernel-parameters not documented here. -See for example . +See for example . Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 79d043b8850d..1abd7e145357 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -53,7 +53,7 @@ ACPI_DEBUG_PRINT statements, e.g., ACPI_DEBUG_PRINT((ACPI_DB_INFO, ... The debug_level mask defaults to "info". See - Documentation/acpi/debug.txt for more information about + Documentation/firmware-guide/acpi/debug.rst for more information about debug layers and levels. Enable processor driver info messages: @@ -963,7 +963,7 @@ for details. nompx [X86] Disables Intel Memory Protection Extensions. - See Documentation/x86/intel_mpx.txt for more + See Documentation/x86/intel_mpx.rst for more information about the feature. nopku [X86] Disable Memory Protection Keys CPU feature found @@ -1189,7 +1189,7 @@ that is to be dynamically loaded by Linux. If there are multiple variables with the same name but with different vendor GUIDs, all of them will be loaded. See - Documentation/acpi/ssdt-overlays.txt for details. + Documentation/admin-guide/acpi/ssdt-overlays.rst for details. eisa_irq_edge= [PARISC,HW] @@ -2383,7 +2383,7 @@ mce [X86-32] Machine Check Exception - mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt + mce=option [X86-64] See Documentation/x86/x86_64/boot-options.rst md= [HW] RAID subsystems devices and level See Documentation/admin-guide/md.rst. @@ -2439,7 +2439,7 @@ set according to the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config option. - See Documentation/memory-hotplug.txt. + See Documentation/admin-guide/mm/memory-hotplug.rst. memmap=exactmap [KNL,X86] Enable setting of an exact E820 memory map, as specified by the user. @@ -2528,7 +2528,7 @@ mem_encrypt=on: Activate SME mem_encrypt=off: Do not activate SME - Refer to Documentation/x86/amd-memory-encryption.txt + Refer to Documentation/virtual/kvm/amd-memory-encryption.rst for details on when memory encryption can be activated. mem_sleep_default= [SUSPEND] Default system suspend mode: @@ -3529,7 +3529,7 @@ See Documentation/blockdev/paride.txt. pirq= [SMP,APIC] Manual mp-table setup - See Documentation/x86/i386/IO-APIC.txt. + See Documentation/x86/i386/IO-APIC.rst. plip= [PPT,NET] Parallel port network link Format: { parport | timid | 0 } @@ -5055,7 +5055,7 @@ Can be used multiple times for multiple devices. vga= [BOOT,X86-32] Select a particular video mode - See Documentation/x86/boot.txt and + See Documentation/x86/boot.rst and Documentation/svga.txt. Use vga=ask for menu. This is actually a boot loader parameter; the value is diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index c7495e42e6f4..2b20f5f7380d 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_. mode). .. [#f3] For more details about the Machine Check Architecture (MCA), - please read Documentation/x86/x86_64/machinecheck at the Kernel tree. + please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree. EDAC - Error Detection And Correction ************************************* diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt index c812e25ae90f..25fc687419db 100644 --- a/Documentation/devicetree/bindings/net/fsl-enetc.txt +++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt @@ -16,8 +16,8 @@ Required properties: In this case, the ENETC node should include a "mdio" sub-node that in turn should contain the "ethernet-phy" node describing the external phy. Below properties are required, their bindings -already defined in ethernet.txt or phy.txt, under -Documentation/devicetree/bindings/net/*. +already defined in Documentation/devicetree/bindings/net/ethernet.txt or +Documentation/devicetree/bindings/net/phy.txt. Required: @@ -51,8 +51,7 @@ Example: connection: In this case, the ENETC port node defines a fixed link connection, -as specified by "fixed-link.txt", under -Documentation/devicetree/bindings/net/*. +as specified by Documentation/devicetree/bindings/net/fixed-link.txt. Required: diff --git a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt index 12b18f82d441..efa2c8b9b85a 100644 --- a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt +++ b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt @@ -3,7 +3,7 @@ Amlogic Meson AXG DWC PCIE SoC controller Amlogic Meson PCIe host controller is based on the Synopsys DesignWare PCI core. It shares common functions with the PCIe DesignWare core driver and inherits common properties defined in -Documentation/devicetree/bindings/pci/designware-pci.txt. +Documentation/devicetree/bindings/pci/designware-pcie.txt. Additional properties are described here: diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt index 7ef2dbe48e8a..14d2eee96b3d 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt @@ -97,7 +97,7 @@ Second Level Nodes - Regulators sent for this regulator including those which are for a strictly lower power state. -Other properties defined in Documentation/devicetree/bindings/regulator.txt +Other properties defined in Documentation/devicetree/bindings/regulator/regulator.txt may also be used. regulator-initial-mode and regulator-allowed-modes may be specified for VRM regulators using mode values from include/dt-bindings/regulator/qcom,rpmh-regulator.h. regulator-allow-bypass diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index e86bd2f64117..60f8640f2b2f 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -277,7 +277,7 @@ it with special cases. the decompressor (the real mode entry point goes to the same 32bit entry point once it switched into protected mode). That entry point supports one calling convention which is documented in - Documentation/x86/boot.txt + Documentation/x86/boot.rst The physical pointer to the device-tree block (defined in chapter II) is passed via setup_data which requires at least boot protocol 2.09. The type filed is defined as diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index b37f3f7b8926..ce91518bf9f4 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -101,7 +101,7 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1:: } For more information about the ACPI GPIO bindings see -Documentation/acpi/gpio-properties.txt. +Documentation/firmware-guide/acpi/gpio-properties.rst. Platform Data ------------- diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index 5e4d8aa68913..fdecb6d711db 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -437,7 +437,7 @@ case, it will be handled by the GPIO subsystem automatically. However, if the _DSD is not present, the mappings between GpioIo()/GpioInt() resources and GPIO connection IDs need to be provided by device drivers. -For details refer to Documentation/acpi/gpio-properties.txt +For details refer to Documentation/firmware-guide/acpi/gpio-properties.rst Interacting With the Legacy GPIO Subsystem diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst index 850be9696931..1252617b520f 100644 --- a/Documentation/firmware-guide/acpi/enumeration.rst +++ b/Documentation/firmware-guide/acpi/enumeration.rst @@ -339,7 +339,7 @@ a code like this:: There are also devm_* versions of these functions which release the descriptors once the device is released. -See Documentation/acpi/gpio-properties.txt for more information about the +See Documentation/firmware-guide/acpi/gpio-properties.rst for more information about the _DSD binding related to GPIOs. MFD devices diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst index d0b077b73f5f..0aa7e2c5d32a 100644 --- a/Documentation/firmware-guide/acpi/method-tracing.rst +++ b/Documentation/firmware-guide/acpi/method-tracing.rst @@ -68,7 +68,7 @@ c. Filter out the debug layer/level matched logs when the specified Where: 0xXXXXXXXX/0xYYYYYYYY - Refer to Documentation/acpi/debug.txt for possible debug layer/level + Refer to Documentation/firmware-guide/acpi/debug.rst for possible debug layer/level masking values. \PPPP.AAAA.TTTT.HHHH Full path of a control method that can be found in the ACPI namespace. diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices index 0d85ac1935b7..5a3e2f331e8c 100644 --- a/Documentation/i2c/instantiating-devices +++ b/Documentation/i2c/instantiating-devices @@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI ------------------------------------------- ACPI can also describe I2C devices. There is special documentation for this -which is currently located at Documentation/acpi/enumeration.txt. +which is currently located at Documentation/firmware-guide/acpi/enumeration.rst. Method 2: Instantiate the devices explicitly diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index f0c86fbb3b48..92f7f34b021a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -155,7 +155,7 @@ is 0x15 and the full version number is 0x234, this file will contain the value 340 = 0x154. See the type_of_loader and ext_loader_type fields in -Documentation/x86/boot.txt for additional information. +Documentation/x86/boot.rst for additional information. ============================================================== @@ -167,7 +167,7 @@ The complete bootloader version number. In the example above, this file will contain the value 564 = 0x234. See the type_of_loader and ext_loader_ver fields in -Documentation/x86/boot.txt for additional information. +Documentation/x86/boot.rst for additional information. ============================================================== diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst index 5301e9d55255..8bb777941394 100644 --- a/Documentation/translations/zh_CN/process/4.Coding.rst +++ b/Documentation/translations/zh_CN/process/4.Coding.rst @@ -241,7 +241,7 @@ scripts/coccinelle目录下已经打包了相当多的内核“语义补丁” 任何添加新用户空间界面的代码(包括新的sysfs或/proc文件)都应该包含该界面的 文档,该文档使用户空间开发人员能够知道他们在使用什么。请参阅 -Documentation/abi/readme,了解如何格式化此文档以及需要提供哪些信息。 +Documentation/ABI/README,了解如何格式化此文档以及需要提供哪些信息。 文件 :ref:`Documentation/admin-guide/kernel-parameters.rst ` 描述了内核的所有引导时间参数。任何添加新参数的补丁都应该向该文件添加适当的 diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/x86/x86_64/5level-paging.rst index ab88a4514163..44856417e6a5 100644 --- a/Documentation/x86/x86_64/5level-paging.rst +++ b/Documentation/x86/x86_64/5level-paging.rst @@ -20,7 +20,7 @@ physical address space. This "ought to be enough for anybody" ©. QEMU 2.9 and later support 5-level paging. Virtual memory layout for 5-level paging is described in -Documentation/x86/x86_64/mm.txt +Documentation/x86/x86_64/mm.rst Enabling 5-level paging diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 2f69836b8445..6a4285a3c7a4 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -9,7 +9,7 @@ only the AMD64 specific ones are listed here. Machine check ============= -Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables. +Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables. mce=off Disable machine check @@ -89,7 +89,7 @@ APICs Don't use the local APIC (alias for i386 compatibility) pirq=... - See Documentation/x86/i386/IO-APIC.txt + See Documentation/x86/i386/IO-APIC.rst noapictimer Don't set up the APIC timer diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 74fbb78b3c67..04df57b9aa3f 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -18,7 +18,7 @@ For more information on the features of cpusets, see Documentation/cgroup-v1/cpusets.txt. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of -configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. +configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. For the purposes of this introduction, we'll assume a very primitive NUMA emulation setup of "numa=fake=4*512,". This will split our system memory into -- cgit From 9915ec28ec7fc79f0f30ebbba5d19bfa17eb7f03 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:34 -0300 Subject: docs: isdn: remove hisax references from kernel-parameters.txt The hisax driver got removed on 85993b8c9786 ("isdn: remove hisax driver"), but a left-over was kept at kernel-parameters.txt. Fixes: 85993b8c9786 ("isdn: remove hisax driver") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 3 --- 1 file changed, 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1abd7e145357..9b16b640ce48 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1388,9 +1388,6 @@ Valid parameters: "on", "off" Default: "on" - hisax= [HW,ISDN] - See Documentation/isdn/README.HiSax. - hlt [BUGS=ARM,SH] hpet= [X86-32,HPET] option to control HPET usage -- cgit From 5c437fa29561f5809ef114ba3a5e80556cc43fb3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:35 -0300 Subject: docs: fs: fix broken links to vfs.txt with was renamed to vfs.rst A recent documentation conversion renamed this file but forgot to update the links. Fixes: af96c1e304f7 ("docs: filesystems: vfs: Convert vfs.txt to RST") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/filesystems/porting | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 3bd1148d8bb6..2813a19389fe 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -330,14 +330,14 @@ unreferenced dentries, and is now only called when the dentry refcount goes to [mandatory] .d_compare() calling convention and locking rules are significantly -changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +changed. Read updated documentation in Documentation/filesystems/vfs.rst (and look at examples of other filesystems) for guidance. --- [mandatory] .d_hash() calling convention and locking rules are significantly -changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +changed. Read updated documentation in Documentation/filesystems/vfs.rst (and look at examples of other filesystems) for guidance. --- @@ -377,12 +377,12 @@ where possible. the filesystem provides it), which requires dropping out of rcu-walk mode. This may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be returned if the filesystem cannot handle rcu-walk. See -Documentation/filesystems/vfs.txt for more details. +Documentation/filesystems/vfs.rst for more details. permission is an inode permission check that is called on many or all directory inodes on the way down a path walk (to check for exec permission). It must now be rcu-walk aware (mask & MAY_NOT_BLOCK). See -Documentation/filesystems/vfs.txt for more details. +Documentation/filesystems/vfs.rst for more details. -- [mandatory] @@ -625,7 +625,7 @@ in your dentry operations instead. -- [mandatory] ->clone_file_range() and ->dedupe_file_range have been replaced with - ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + ->remap_file_range(). See Documentation/filesystems/vfs.rst for more information. -- [recommended] -- cgit From b640fbad2d8fe120c761f61eb6c96f05047100cd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 7 Jun 2019 15:54:36 -0300 Subject: docs: pci: fix broken links due to conversion from pci.txt to pci.rst Some documentation files were still pointing to the old place. Fixes: 229b4e0728e0 ("Documentation: PCI: convert pci.txt to reST") Signed-off-by: Mauro Carvalho Chehab Acked-by: Paul E. McKenney Signed-off-by: Jonathan Corbet --- Documentation/memory-barriers.txt | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f70ebcdfe592..f4170aae1d75 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -548,7 +548,7 @@ There are certain things that the Linux kernel memory barriers do not guarantee: [*] For information on bus mastering DMA and coherency please read: - Documentation/PCI/pci.txt + Documentation/PCI/pci.rst Documentation/DMA-API-HOWTO.txt Documentation/DMA-API.txt diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index db0b9d8619f1..07725b1df002 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -569,7 +569,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE [*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다: - Documentation/PCI/pci.txt + Documentation/PCI/pci.rst Documentation/DMA-API-HOWTO.txt Documentation/DMA-API.txt -- cgit From 12775af50549121ade8efbdcf3612b867188ea0b Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 6 Jun 2019 19:30:38 +0300 Subject: dt-bindings: doc: net: keystone-netcp: document cpts The Keystone 2 66AK2HK/E/L 1G Ethernet Switch Subsystems contains The Common Platform Time Sync (CPTS) module which is in general compatible with CPTS module found on "legacy" TI AM3/4/5 SoCs. So, the basic support for Keystone 2 CPTS is available by default, but not documented. The Keystone 2 CPTS module supports also some additional features like time sync reference (RFTCLK) clock selection through CPTS_RFTCLK_SEL register (offset: x08) in CPTS module, which is modelled as multiplexer clock. This patch adds missed binding documentation for Keystone 2 66AK2HK/E/L CPTS module. Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Signed-off-by: David S. Miller --- .../devicetree/bindings/net/keystone-netcp.txt | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt index 6262c2f293b0..24f11e042f8d 100644 --- a/Documentation/devicetree/bindings/net/keystone-netcp.txt +++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt @@ -104,6 +104,23 @@ Required properties: - 10Gb mac<->mac forced mode : 11 ----phy-handle: phandle to PHY device +- cpts: sub-node time synchronization (CPTS) submodule configuration +-- clocks: CPTS reference clock. Should point on cpts-refclk-mux clock. +-- clock-names: should be "cpts" +-- cpts-refclk-mux: multiplexer clock definition sub-node for CPTS reference (RFTCLK) clock +--- #clock-cells: should be 0 +--- clocks: list of CPTS reference (RFTCLK) clock's parents as defined in Data manual +--- ti,mux-tbl: array of multiplexer indexes as defined in Data manual +--- assigned-clocks: should point on cpts-refclk-mux clock +--- assigned-clock-parents: should point on required RFTCLK clock parent to be selected +-- cpts_clock_mult: (optional) Numerator to convert input clock ticks + into nanoseconds +-- cpts_clock_shift: (optional) Denominator to convert input clock ticks into + nanoseconds. + Mult and shift will be calculated basing on CPTS + rftclk frequency if both cpts_clock_shift and + cpts_clock_mult properties are not provided. + Optional properties: - enable-ale: NetCP driver keeps the address learning feature in the ethernet switch module disabled. This attribute is to enable the address @@ -168,6 +185,23 @@ netcp: netcp@2000000 { tx-queue = <648>; tx-channel = <8>; + cpts { + clocks = <&cpts_refclk_mux>; + clock-names = "cpts"; + + cpts_refclk_mux: cpts-refclk-mux { + #clock-cells = <0>; + clocks = <&chipclk12>, <&chipclk13>, + <&timi0>, <&timi1>, + <&tsipclka>, <&tsrefclk>, + <&tsipclkb>; + ti,mux-tbl = <0x0>, <0x1>, <0x2>, + <0x3>, <0x4>, <0x8>, <0xC>; + assigned-clocks = <&cpts_refclk_mux>; + assigned-clock-parents = <&chipclk12>; + }; + }; + interfaces { gbe0: interface-0 { slave-port = <0>; @@ -219,3 +253,13 @@ netcp: netcp@2000000 { }; }; }; + +CPTS board configuration - select external CPTS RFTCLK: + +&tsrefclk{ + clock-frequency = <500000000>; +}; + +&cpts_refclk_mux { + assigned-clock-parents = <&tsrefclk>; +}; -- cgit From e3630fd9aa2a11d0dd12b92e7d51299f9b547d64 Mon Sep 17 00:00:00 2001 From: Teresa Remmet Date: Fri, 24 May 2019 15:20:03 +0200 Subject: ARM: dts: Add support for phyBOARD-REGOR-AM335x Adding support for phyBOARD-REGOR-AM335x: - CAN - UART0 / UART2 - RS485 - USB device - i2c RTC - SD - ethernet0 RMII - ethernet1 MII - Digital I/Os Signed-off-by: Teresa Remmet Signed-off-by: Tony Lindgren --- Documentation/devicetree/bindings/arm/omap/omap.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt index 1c1e48fd94b5..b301f753ed2c 100644 --- a/Documentation/devicetree/bindings/arm/omap/omap.txt +++ b/Documentation/devicetree/bindings/arm/omap/omap.txt @@ -160,6 +160,9 @@ Boards: - AM335X phyCORE-AM335x: Development kit compatible = "phytec,am335x-pcm-953", "phytec,am335x-phycore-som", "ti,am33xx" +- AM335x phyBOARD-REGOR: Single Board Computer + compatible = "phytec,am335x-regor", "phytec,am335x-phycore-som", "ti,am33xx" + - AM335X UC-8100-ME-T: Communication-centric industrial computing platform compatible = "moxa,uc-8100-me-t", "ti,am33xx"; -- cgit From 5a46b6fa0f33c2ca7a8432a0a48e1bf4a71fcc9f Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 27 May 2019 14:43:05 +0200 Subject: dt-bindings: mmc: meson-gx: add dram-access-quirk property On the Amlogic G12A SoC family, (only) the SDIO controller has a bug which makes any DRAM access from the MMC controller fail. Add the amlogic,dram-access-quirk property so signal this particular controller has this bug and needs a quirk to work properly. Reviewed-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt index 13e70409e8ac..ccc5358db131 100644 --- a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt +++ b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt @@ -22,6 +22,10 @@ Required properties: clock rate requested by the MMC core. - resets : phandle of the internal reset line +Optional properties: +- amlogic,dram-access-quirk: set when controller's internal DMA engine cannot access the + DRAM memory, like on the G12A dedicated SDIO controller. + Example: sd_emmc_a: mmc@70000 { -- cgit From f9b7989859dd005ffcd8aea5a9c862f61cda9cd8 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 May 2019 09:23:43 +0200 Subject: dt-bindings: mmc: Add YAML schemas for the generic MMC options The MMC controllers have a bunch of generic options that are needed in a device tree. Add a YAML schemas for those. Reviewed-by: Rob Herring Signed-off-by: Maxime Ripard Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/mmc-controller.yaml | 374 +++++++++++++++++++++ Documentation/devicetree/bindings/mmc/mmc.txt | 178 +--------- 2 files changed, 375 insertions(+), 177 deletions(-) create mode 100644 Documentation/devicetree/bindings/mmc/mmc-controller.yaml (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml new file mode 100644 index 000000000000..080754e0ef35 --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml @@ -0,0 +1,374 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mmc/mmc-controller.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MMC Controller Generic Binding + +maintainers: + - Ulf Hansson + +description: | + These properties are common to multiple MMC host controllers. Any host + that requires the respective functionality should implement them using + these definitions. + +properties: + $nodename: + pattern: "^mmc(@.*)?$" + + "#address-cells": + const: 1 + description: | + The cell is the slot ID if a function subnode is used. + + "#size-cells": + const: 0 + + # Card Detection. + # If none of these properties are supplied, the host native card + # detect will be used. Only one of them should be provided. + + broken-cd: + $ref: /schemas/types.yaml#/definitions/flag + description: + There is no card detection available; polling must be used. + + cd-gpios: + description: + The card detection will be done using the GPIO provided. + + non-removable: + $ref: /schemas/types.yaml#/definitions/flag + description: + Non-removable slot (like eMMC); assume always present. + + # *NOTE* on CD and WP polarity. To use common for all SD/MMC host + # controllers line polarity properties, we have to fix the meaning + # of the "normal" and "inverted" line levels. We choose to follow + # the SDHCI standard, which specifies both those lines as "active + # low." Therefore, using the "cd-inverted" property means, that the + # CD line is active high, i.e. it is high, when a card is + # inserted. Similar logic applies to the "wp-inverted" property. + # + # CD and WP lines can be implemented on the hardware in one of two + # ways: as GPIOs, specified in cd-gpios and wp-gpios properties, or + # as dedicated pins. Polarity of dedicated pins can be specified, + # using *-inverted properties. GPIO polarity can also be specified + # using the GPIO_ACTIVE_LOW flag. This creates an ambiguity in the + # latter case. We choose to use the XOR logic for GPIO CD and WP + # lines. This means, the two properties are "superimposed," for + # example leaving the GPIO_ACTIVE_LOW flag clear and specifying the + # respective *-inverted property property results in a + # double-inversion and actually means the "normal" line polarity is + # in effect. + wp-inverted: + $ref: /schemas/types.yaml#/definitions/flag + description: + The Write Protect line polarity is inverted. + + cd-inverted: + $ref: /schemas/types.yaml#/definitions/flag + description: + The CD line polarity is inverted. + + # Other properties + + bus-width: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - enum: [1, 4, 8] + default: 1 + description: + Number of data lines. + + max-frequency: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - minimum: 400000 + - maximum: 200000000 + description: + Maximum operating frequency of the bus. + + disable-wp: + $ref: /schemas/types.yaml#/definitions/flag + description: + When set, no physical write-protect line is present. This + property should only be specified when the controller has a + dedicated write-protect detection logic. If a GPIO is always + used for the write-protect detection. If a GPIO is always used + for the write-protect detection logic, it is sufficient to not + specify the wp-gpios property in the absence of a write-protect + line. + + wp-gpios: + description: + GPIO to use for the write-protect detection. + + cd-debounce-delay-ms: + description: + Set delay time before detecting card after card insert + interrupt. + + no-1-8-v: + $ref: /schemas/types.yaml#/definitions/flag + description: + When specified, denotes that 1.8V card voltage is not supported + on this system, even if the controller claims it. + + cap-sd-highspeed: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD high-speed timing is supported. + + cap-mmc-highspeed: + $ref: /schemas/types.yaml#/definitions/flag + description: + MMC high-speed timing is supported. + + sd-uhs-sdr12: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD UHS SDR12 speed is supported. + + sd-uhs-sdr25: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD UHS SDR25 speed is supported. + + sd-uhs-sdr50: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD UHS SDR50 speed is supported. + + sd-uhs-sdr104: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD UHS SDR104 speed is supported. + + sd-uhs-ddr50: + $ref: /schemas/types.yaml#/definitions/flag + description: + SD UHS DDR50 speed is supported. + + cap-power-off-card: + $ref: /schemas/types.yaml#/definitions/flag + description: + Powering off the card is safe. + + cap-mmc-hw-reset: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC hardware reset is supported + + cap-sdio-irq: + $ref: /schemas/types.yaml#/definitions/flag + description: + enable SDIO IRQ signalling on this interface + + full-pwr-cycle: + $ref: /schemas/types.yaml#/definitions/flag + description: + Full power cycle of the card is supported. + + mmc-ddr-1_2v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC high-speed DDR mode (1.2V I/O) is supported. + + mmc-ddr-1_8v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC high-speed DDR mode (1.8V I/O) is supported. + + mmc-ddr-3_3v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC high-speed DDR mode (3.3V I/O) is supported. + + mmc-hs200-1_2v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC HS200 mode (1.2V I/O) is supported. + + mmc-hs200-1_8v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC HS200 mode (1.8V I/O) is supported. + + mmc-hs400-1_2v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC HS400 mode (1.2V I/O) is supported. + + mmc-hs400-1_8v: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC HS400 mode (1.8V I/O) is supported. + + mmc-hs400-enhanced-strobe: + $ref: /schemas/types.yaml#/definitions/flag + description: + eMMC HS400 enhanced strobe mode is supported + + dsr: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - minimum: 0 + - maximum: 0xffff + description: + Value the card Driver Stage Register (DSR) should be programmed + with. + + no-sdio: + $ref: /schemas/types.yaml#/definitions/flag + description: + Controller is limited to send SDIO commands during + initialization. + + no-sd: + $ref: /schemas/types.yaml#/definitions/flag + description: + Controller is limited to send SD commands during initialization. + + no-mmc: + $ref: /schemas/types.yaml#/definitions/flag + description: + Controller is limited to send MMC commands during + initialization. + + fixed-emmc-driver-type: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - minimum: 0 + - maximum: 4 + description: + For non-removable eMMC, enforce this driver type. The value is + the driver type as specified in the eMMC specification (table + 206 in spec version 5.1) + + post-power-on-delay-ms: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32 + - default: 10 + description: + It was invented for MMC pwrseq-simple which could be referred to + mmc-pwrseq-simple.txt. But now it\'s reused as a tunable delay + waiting for I/O signalling and card power supply to be stable, + regardless of whether pwrseq-simple is used. Default to 10ms if + no available. + + supports-cqe: + $ref: /schemas/types.yaml#/definitions/flag + description: + The presence of this property indicates that the corresponding + MMC host controller supports HW command queue feature. + + disable-cqe-dcmd: + $ref: /schemas/types.yaml#/definitions/flag + description: + The presence of this property indicates that the MMC + controller\'s command queue engine (CQE) does not support direct + commands (DCMDs). + + keep-power-in-suspend: + $ref: /schemas/types.yaml#/definitions/flag + description: + SDIO only. Preserves card power during a suspend/resume cycle. + + # Deprecated: enable-sdio-wakeup + wakeup-source: + $ref: /schemas/types.yaml#/definitions/flag + description: + SDIO only. Enables wake up of host system on SDIO IRQ assertion. + + vmmc-supply: + description: + Supply for the card power + + vqmmc-supply: + description: + Supply for the bus IO line power + + mmc-pwrseq: + $ref: /schemas/types.yaml#/definitions/phandle + description: + System-on-Chip designs may specify a specific MMC power + sequence. To successfully detect an (e)MMC/SD/SDIO card, that + power sequence must be maintained while initializing the card. + +patternProperties: + "^.*@[0-9]+$": + type: object + description: | + On embedded systems the cards connected to a host may need + additional properties. These can be specified in subnodes to the + host controller node. The subnodes are identified by the + standard \'reg\' property. Which information exactly can be + specified depends on the bindings for the SDIO function driver + for the subnode, as specified by the compatible string. + + properties: + compatible: + description: | + Name of SDIO function following generic names recommended + practice + + reg: + items: + - minimum: 0 + maximum: 7 + description: + Must contain the SDIO function number of the function this + subnode describes. A value of 0 denotes the memory SD + function, values from 1 to 7 denote the SDIO functions. + + broken-hpi: + $ref: /schemas/types.yaml#/definitions/flag + description: + Use this to indicate that the mmc-card has a broken hpi + implementation, and that hpi should not be used. + + required: + - reg + +dependencies: + cd-debounce-delay-ms: [ cd-gpios ] + fixed-emmc-driver-type: [ non-removable ] + +examples: + - | + sdhci@ab000000 { + compatible = "sdhci"; + reg = <0xab000000 0x200>; + interrupts = <23>; + bus-width = <4>; + cd-gpios = <&gpio 69 0>; + cd-inverted; + wp-gpios = <&gpio 70 0>; + max-frequency = <50000000>; + keep-power-in-suspend; + wakeup-source; + mmc-pwrseq = <&sdhci0_pwrseq>; + }; + + - | + mmc3: mmc@1c12000 { + #address-cells = <1>; + #size-cells = <0>; + pinctrl-names = "default"; + pinctrl-0 = <&mmc3_pins_a>; + vmmc-supply = <®_vmmc3>; + bus-width = <4>; + non-removable; + mmc-pwrseq = <&sdhci0_pwrseq>; + + brcmf: bcrmf@1 { + reg = <1>; + compatible = "brcm,bcm43xx-fmac"; + interrupt-parent = <&pio>; + interrupts = <10 8>; + interrupt-names = "host-wake"; + }; + }; diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt index c269dbe384fe..bf9d7d3febf1 100644 --- a/Documentation/devicetree/bindings/mmc/mmc.txt +++ b/Documentation/devicetree/bindings/mmc/mmc.txt @@ -1,177 +1 @@ -These properties are common to multiple MMC host controllers. Any host -that requires the respective functionality should implement them using -these definitions. - -Interpreted by the OF core: -- reg: Registers location and length. -- interrupts: Interrupts used by the MMC controller. - -Card detection: -If no property below is supplied, host native card detect is used. -Only one of the properties in this section should be supplied: - - broken-cd: There is no card detection available; polling must be used. - - cd-gpios: Specify GPIOs for card detection, see gpio binding - - non-removable: non-removable slot (like eMMC); assume always present. - -Optional properties: -- bus-width: Number of data lines, can be <1>, <4>, or <8>. The default - will be <1> if the property is absent. -- wp-gpios: Specify GPIOs for write protection, see gpio binding -- cd-inverted: when present, polarity on the CD line is inverted. See the note - below for the case, when a GPIO is used for the CD line -- cd-debounce-delay-ms: Set delay time before detecting card after card insert interrupt. - It's only valid when cd-gpios is present. -- wp-inverted: when present, polarity on the WP line is inverted. See the note - below for the case, when a GPIO is used for the WP line -- disable-wp: When set no physical WP line is present. This property should - only be specified when the controller has a dedicated write-protect - detection logic. If a GPIO is always used for the write-protect detection - logic it is sufficient to not specify wp-gpios property in the absence of a WP - line. -- max-frequency: maximum operating clock frequency -- no-1-8-v: when present, denotes that 1.8v card voltage is not supported on - this system, even if the controller claims it is. -- cap-sd-highspeed: SD high-speed timing is supported -- cap-mmc-highspeed: MMC high-speed timing is supported -- sd-uhs-sdr12: SD UHS SDR12 speed is supported -- sd-uhs-sdr25: SD UHS SDR25 speed is supported -- sd-uhs-sdr50: SD UHS SDR50 speed is supported -- sd-uhs-sdr104: SD UHS SDR104 speed is supported -- sd-uhs-ddr50: SD UHS DDR50 speed is supported -- cap-power-off-card: powering off the card is safe -- cap-mmc-hw-reset: eMMC hardware reset is supported -- cap-sdio-irq: enable SDIO IRQ signalling on this interface -- full-pwr-cycle: full power cycle of the card is supported -- mmc-ddr-3_3v: eMMC high-speed DDR mode(3.3V I/O) is supported -- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported -- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported -- mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported -- mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported -- mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported -- mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported -- mmc-hs400-enhanced-strobe: eMMC HS400 enhanced strobe mode is supported -- dsr: Value the card's (optional) Driver Stage Register (DSR) should be - programmed with. Valid range: [0 .. 0xffff]. -- no-sdio: controller is limited to send sdio cmd during initialization -- no-sd: controller is limited to send sd cmd during initialization -- no-mmc: controller is limited to send mmc cmd during initialization -- fixed-emmc-driver-type: for non-removable eMMC, enforce this driver type. - The value is the driver type as specified in the eMMC specification - (table 206 in spec version 5.1). -- post-power-on-delay-ms : It was invented for MMC pwrseq-simple which could - be referred to mmc-pwrseq-simple.txt. But now it's reused as a tunable delay - waiting for I/O signalling and card power supply to be stable, regardless of - whether pwrseq-simple is used. Default to 10ms if no available. -- supports-cqe : The presence of this property indicates that the corresponding - MMC host controller supports HW command queue feature. -- disable-cqe-dcmd: This property indicates that the MMC controller's command - queue engine (CQE) does not support direct commands (DCMDs). - -*NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line -polarity properties, we have to fix the meaning of the "normal" and "inverted" -line levels. We choose to follow the SDHCI standard, which specifies both those -lines as "active low." Therefore, using the "cd-inverted" property means, that -the CD line is active high, i.e. it is high, when a card is inserted. Similar -logic applies to the "wp-inverted" property. - -CD and WP lines can be implemented on the hardware in one of two ways: as GPIOs, -specified in cd-gpios and wp-gpios properties, or as dedicated pins. Polarity of -dedicated pins can be specified, using *-inverted properties. GPIO polarity can -also be specified using the GPIO_ACTIVE_LOW flag. This creates an ambiguity -in the latter case. We choose to use the XOR logic for GPIO CD and WP lines. -This means, the two properties are "superimposed," for example leaving the -GPIO_ACTIVE_LOW flag clear and specifying the respective *-inverted property -property results in a double-inversion and actually means the "normal" line -polarity is in effect. - -Optional SDIO properties: -- keep-power-in-suspend: Preserves card power during a suspend/resume cycle -- wakeup-source: Enables wake up of host system on SDIO IRQ assertion - (Legacy property supported: "enable-sdio-wakeup") - -MMC power ---------- - -Controllers may implement power control from both the connected cards and -the IO signaling (for example to change to high-speed 1.8V signalling). If -the system supports this, then the following two properties should point -to valid regulator nodes: - -- vqmmc-supply: supply node for IO line power -- vmmc-supply: supply node for card's power - - -MMC power sequences: --------------------- - -System on chip designs may specify a specific MMC power sequence. To -successfully detect an (e)MMC/SD/SDIO card, that power sequence must be -maintained while initializing the card. - -Optional property: -- mmc-pwrseq: phandle to the MMC power sequence node. See "mmc-pwrseq-*" - for documentation of MMC power sequence bindings. - - -Use of Function subnodes ------------------------- - -On embedded systems the cards connected to a host may need additional -properties. These can be specified in subnodes to the host controller node. -The subnodes are identified by the standard 'reg' property. -Which information exactly can be specified depends on the bindings for the -SDIO function driver for the subnode, as specified by the compatible string. - -Required host node properties when using function subnodes: -- #address-cells: should be one. The cell is the slot id. -- #size-cells: should be zero. - -Required function subnode properties: -- reg: Must contain the SDIO function number of the function this subnode - describes. A value of 0 denotes the memory SD function, values from - 1 to 7 denote the SDIO functions. - -Optional function subnode properties: -- compatible: name of SDIO function following generic names recommended practice - - -Examples --------- - -Basic example: - -sdhci@ab000000 { - compatible = "sdhci"; - reg = <0xab000000 0x200>; - interrupts = <23>; - bus-width = <4>; - cd-gpios = <&gpio 69 0>; - cd-inverted; - wp-gpios = <&gpio 70 0>; - max-frequency = <50000000>; - keep-power-in-suspend; - wakeup-source; - mmc-pwrseq = <&sdhci0_pwrseq> -} - -Example with sdio function subnode: - -mmc3: mmc@1c12000 { - #address-cells = <1>; - #size-cells = <0>; - - pinctrl-names = "default"; - pinctrl-0 = <&mmc3_pins_a>; - vmmc-supply = <®_vmmc3>; - bus-width = <4>; - non-removable; - mmc-pwrseq = <&sdhci0_pwrseq> - - brcmf: bcrmf@1 { - reg = <1>; - compatible = "brcm,bcm43xx-fmac"; - interrupt-parent = <&pio>; - interrupts = <10 8>; /* PH10 / EINT10 */ - interrupt-names = "host-wake"; - }; -}; +This file has moved to mmc-controller.yaml. -- cgit From ca4570a4c2d3be309ff4b8976efa7c8291ca6b6d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 May 2019 09:23:44 +0200 Subject: dt-bindings: mmc: sun4i: Add YAML schemas Switch the DT binding to a YAML schema to enable the DT validation. Reviewed-by: Rob Herring Signed-off-by: Maxime Ripard Signed-off-by: Ulf Hansson --- .../bindings/mmc/allwinner,sun4i-a10-mmc.yaml | 98 ++++++++++++++++++++++ .../devicetree/bindings/mmc/sunxi-mmc.txt | 52 ------------ 2 files changed, 98 insertions(+), 52 deletions(-) create mode 100644 Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml delete mode 100644 Documentation/devicetree/bindings/mmc/sunxi-mmc.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml new file mode 100644 index 000000000000..df0280edef97 --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mmc/allwinner,sun4i-a10-mmc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 MMC Controller Device Tree Bindings + +allOf: + - $ref: "mmc-controller.yaml" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#address-cells": true + "#size-cells": true + + compatible: + oneOf: + - const: allwinner,sun4i-a10-mmc + - const: allwinner,sun5i-a13-mmc + - const: allwinner,sun7i-a20-mmc + - const: allwinner,sun8i-a83t-emmc + - const: allwinner,sun9i-a80-mmc + - const: allwinner,sun50i-a64-emmc + - const: allwinner,sun50i-a64-mmc + - items: + - const: allwinner,sun8i-a83t-mmc + - const: allwinner,sun7i-a20-mmc + - items: + - const: allwinner,sun50i-h6-emmc + - const: allwinner,sun50i-a64-emmc + - items: + - const: allwinner,sun50i-h6-mmc + - const: allwinner,sun50i-a64-mmc + - items: + - const: allwinner,sun8i-r40-emmc + - const: allwinner,sun50i-a64-emmc + - items: + - const: allwinner,sun8i-r40-mmc + - const: allwinner,sun50i-a64-mmc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 2 + maxItems: 4 + items: + - description: Bus Clock + - description: Module Clock + - description: Output Clock + - description: Sample Clock + + clock-names: + minItems: 2 + maxItems: 4 + items: + - const: ahb + - const: mmc + - const: output + - const: sample + + resets: + maxItems: 1 + + reset-names: + const: ahb + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +examples: + - | + mmc0: mmc@1c0f000 { + compatible = "allwinner,sun5i-a13-mmc"; + reg = <0x01c0f000 0x1000>; + clocks = <&ahb_gates 8>, <&mmc0_clk>; + clock-names = "ahb", "mmc"; + interrupts = <32>; + bus-width = <4>; + cd-gpios = <&pio 7 1 0>; + }; + +# FIXME: We should set it, but it would report all the generic +# properties as additional properties. +# additionalProperties: false + +... diff --git a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt b/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt deleted file mode 100644 index e9cb3ec5e502..000000000000 --- a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt +++ /dev/null @@ -1,52 +0,0 @@ -* Allwinner sunxi MMC controller - -The highspeed MMC host controller on Allwinner SoCs provides an interface -for MMC, SD and SDIO types of memory cards. - -Supported maximum speeds are the ones of the eMMC standard 4.5 as well -as the speed of SD standard 3.0. -Absolute maximum transfer rate is 200MB/s - -Required properties: - - compatible : should be one of: - * "allwinner,sun4i-a10-mmc" - * "allwinner,sun5i-a13-mmc" - * "allwinner,sun7i-a20-mmc" - * "allwinner,sun8i-a83t-emmc" - * "allwinner,sun9i-a80-mmc" - * "allwinner,sun50i-a64-emmc" - * "allwinner,sun50i-a64-mmc" - * "allwinner,sun50i-h6-emmc", "allwinner.sun50i-a64-emmc" - * "allwinner,sun50i-h6-mmc", "allwinner.sun50i-a64-mmc" - - reg : mmc controller base registers - - clocks : a list with 4 phandle + clock specifier pairs - - clock-names : must contain "ahb", "mmc", "output" and "sample" - - interrupts : mmc controller interrupt - -Optional properties: - - resets : phandle + reset specifier pair - - reset-names : must contain "ahb" - - for cd, bus-width and additional generic mmc parameters - please refer to mmc.txt within this directory - -Examples: - - Within .dtsi: - mmc0: mmc@1c0f000 { - compatible = "allwinner,sun5i-a13-mmc"; - reg = <0x01c0f000 0x1000>; - clocks = <&ahb_gates 8>, <&mmc0_clk>, <&mmc0_output_clk>, <&mmc0_sample_clk>; - clock-names = "ahb", "mod", "output", "sample"; - interrupts = <0 32 4>; - status = "disabled"; - }; - - - Within dts: - mmc0: mmc@1c0f000 { - pinctrl-names = "default", "default"; - pinctrl-0 = <&mmc0_pins_a>; - pinctrl-1 = <&mmc0_cd_pin_reference_design>; - bus-width = <4>; - cd-gpios = <&pio 7 1 0>; /* PH1 */ - cd-inverted; - status = "okay"; - }; -- cgit From 334eb9bcb94f6e02d96268b76dd270a399643176 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 4 Jun 2019 16:14:22 +0800 Subject: dt-bindings: mmc: sprd: Add another optional clock documentation For some Spreadtrum platforms like SC9860 platform, we should enable another gate clock '2x_enable' to make the SD host controller work well. Thus add documentation for this optional clock. Signed-off-by: Baolin Wang Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-sprd.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt index 45c9978aad7b..a285c773acd0 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt +++ b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt @@ -14,6 +14,7 @@ Required properties: - clock-names: Should contain the following: "sdio" - SDIO source clock (required) "enable" - gate clock which used for enabling/disabling the device (required) + "2x_enable" - gate clock controlling the device for some special platforms (optional) Optional properties: - assigned-clocks: the same with "sdio" clock -- cgit From c8ff5351b598cd722c9cf8a84259887bd1af2e0e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 4 Jun 2019 16:14:27 +0800 Subject: dt-bindings: mmc: sprd: Add PHY DLL delay documentation Introduce some PHY DLL delays properties to help to sample the PHY clock. Signed-off-by: Baolin Wang Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-sprd.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt index a285c773acd0..e675397fa428 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt +++ b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt @@ -20,6 +20,23 @@ Optional properties: - assigned-clocks: the same with "sdio" clock - assigned-clock-parents: the default parent of "sdio" clock +PHY DLL delays are used to delay the data valid window, and align the window +to sampling clock. PHY DLL delays can be configured by following properties, +and each property contains 4 cells which are used to configure the clock data +write line delay value, clock read command line delay value, clock read data +positive edge delay value and clock read data negative edge delay value. +Each cell's delay value unit is cycle of the PHY clock. + +- sprd,phy-delay-legacy: Delay value for legacy timing. +- sprd,phy-delay-sd-highspeed: Delay value for SD high-speed timing. +- sprd,phy-delay-sd-uhs-sdr50: Delay value for SD UHS SDR50 timing. +- sprd,phy-delay-sd-uhs-sdr104: Delay value for SD UHS SDR50 timing. +- sprd,phy-delay-mmc-highspeed: Delay value for MMC high-speed timing. +- sprd,phy-delay-mmc-ddr52: Delay value for MMC DDR52 timing. +- sprd,phy-delay-mmc-hs200: Delay value for MMC HS200 timing. +- sprd,phy-delay-mmc-hs400: Delay value for MMC HS400 timing. +- sprd,phy-delay-mmc-hs400es: Delay value for MMC HS400 enhanced strobe timing. + Examples: sdio0: sdio@20600000 { @@ -33,6 +50,7 @@ sdio0: sdio@20600000 { assigned-clocks = <&ap_clk CLK_EMMC_2X>; assigned-clock-parents = <&rpll CLK_RPLL_390M>; + sprd,phy-delay-sd-uhs-sdr104 = <0x3f 0x7f 0x2e 0x2e>; bus-width = <8>; non-removable; no-sdio; -- cgit From 51bd6f291583684f495ea498984dfc22049d7fd2 Mon Sep 17 00:00:00 2001 From: Asmaa Mnebhi Date: Mon, 10 Jun 2019 14:57:02 -0400 Subject: Add support for IPMB driver Support receiving IPMB requests on a Satellite MC from the BMC. Once a response is ready, this driver will send back a response to the BMC via the IPMB channel. Signed-off-by: Asmaa Mnebhi Acked-by: vadimp@mellanox.com Message-Id: <319690553a0da2a1e80b400941341081b383e5f1.1560192707.git.Asmaa@mellanox.com> [Move the config option to outside the ipmi msghandler, as it's not dependent on that. Fixed one small whitespace issue.] Signed-off-by: Corey Minyard --- Documentation/IPMB.txt | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 Documentation/IPMB.txt (limited to 'Documentation') diff --git a/Documentation/IPMB.txt b/Documentation/IPMB.txt new file mode 100644 index 000000000000..a6ed8b68bd0f --- /dev/null +++ b/Documentation/IPMB.txt @@ -0,0 +1,103 @@ +============================== +IPMB Driver for a Satellite MC +============================== + +The Intelligent Platform Management Bus or IPMB, is an +I2C bus that provides a standardized interconnection between +different boards within a chassis. This interconnection is +between the baseboard management (BMC) and chassis electronics. +IPMB is also associated with the messaging protocol through the +IPMB bus. + +The devices using the IPMB are usually management +controllers that perform management functions such as servicing +the front panel interface, monitoring the baseboard, +hot-swapping disk drivers in the system chassis, etc... + +When an IPMB is implemented in the system, the BMC serves as +a controller to give system software access to the IPMB. The BMC +sends IPMI requests to a device (usually a Satellite Management +Controller or Satellite MC) via IPMB and the device +sends a response back to the BMC. + +For more information on IPMB and the format of an IPMB message, +refer to the IPMB and IPMI specifications. + +IPMB driver for Satellite MC +---------------------------- + +ipmb-dev-int - This is the driver needed on a Satellite MC to +receive IPMB messages from a BMC and send a response back. +This driver works with the I2C driver and a userspace +program such as OpenIPMI: + +1) It is an I2C slave backend driver. So, it defines a callback +function to set the Satellite MC as an I2C slave. +This callback function handles the received IPMI requests. + +2) It defines the read and write functions to enable a user +space program (such as OpenIPMI) to communicate with the kernel. + + +Load the IPMB driver +-------------------- + +The driver needs to be loaded at boot time or manually first. +First, make sure you have the following in your config file: +CONFIG_IPMB_DEVICE_INTERFACE=y + +1) If you want the driver to be loaded at boot time: + +a) Add this entry to your ACPI table, under the appropriate SMBus: + +Device (SMB0) // Example SMBus host controller +{ + Name (_HID, "") // Vendor-Specific HID + Name (_UID, 0) // Unique ID of particular host controller + : + : + Device (IPMB) + { + Name (_HID, "IPMB0001") // IPMB device interface + Name (_UID, 0) // Unique device identifier + } +} + +b) Example for device tree: + +&i2c2 { + status = "okay"; + + ipmb@10 { + compatible = "ipmb-dev"; + reg = <0x10>; + }; +}; + +2) Manually from Linux: +modprobe ipmb-dev-int + + +Instantiate the device +---------------------- + +After loading the driver, you can instantiate the device as +described in 'Documentation/i2c/instantiating-devices'. +If you have multiple BMCs, each connected to your Satellite MC via +a different I2C bus, you can instantiate a device for each of +those BMCs. +The name of the instantiated device contains the I2C bus number +associated with it as follows: + +BMC1 ------ IPMB/I2C bus 1 ---------| /dev/ipmb-1 + Satellite MC +BMC1 ------ IPMB/I2C bus 2 ---------| /dev/ipmb-2 + +For instance, you can instantiate the ipmb-dev-int device from +user space at the 7 bit address 0x10 on bus 2: + + # echo ipmb-dev 0x1010 > /sys/bus/i2c/devices/i2c-2/new_device + +This will create the device file /dev/ipmb-2, which can be accessed +by the user space program. The device needs to be instantiated +before running the user space program. -- cgit From dc3988f40fdf7a51bd5480c3383372f463e4dfa9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:15 -0300 Subject: docs: Debugging390.txt: convert table to ascii artwork The first bit/value table inside the document is very hard to read and won't fit ReST format. Also, some columns aren't properly aligned. Convert it to a nice ascii artwork table with makes it easier to read as plain text and is compatible with ReST format parser on Sphinx. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- Documentation/s390/Debugging390.txt | 210 ++++++++++++++++++++---------------- 1 file changed, 120 insertions(+), 90 deletions(-) (limited to 'Documentation') diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt index 5ae7f868a007..c35804c238ad 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/Debugging390.txt @@ -78,96 +78,126 @@ e.g. switching address translation off requires that you have a logical=physical mapping for the address you are currently running at. - Bit Value -s/390 z/Architecture -0 0 Reserved ( must be 0 ) otherwise specification exception occurs. - -1 1 Program Event Recording 1 PER enabled, - PER is used to facilitate debugging e.g. single stepping. - -2-4 2-4 Reserved ( must be 0 ). - -5 5 Dynamic address translation 1=DAT on. - -6 6 Input/Output interrupt Mask - -7 7 External interrupt Mask used primarily for interprocessor - signalling and clock interrupts. - -8-11 8-11 PSW Key used for complex memory protection mechanism - (not used under linux) - -12 12 1 on s/390 0 on z/Architecture - -13 13 Machine Check Mask 1=enable machine check interrupts - -14 14 Wait State. Set this to 1 to stop the processor except for - interrupts and give time to other LPARS. Used in CPU idle in - the kernel to increase overall usage of processor resources. - -15 15 Problem state ( if set to 1 certain instructions are disabled ) - all linux user programs run with this bit 1 - ( useful info for debugging under VM ). - -16-17 16-17 Address Space Control - - 00 Primary Space Mode: - The register CR1 contains the primary address-space control ele- - ment (PASCE), which points to the primary space region/segment - table origin. - - 01 Access register mode - - 10 Secondary Space Mode: - The register CR7 contains the secondary address-space control - element (SASCE), which points to the secondary space region or - segment table origin. - - 11 Home Space Mode: - The register CR13 contains the home space address-space control - element (HASCE), which points to the home space region/segment - table origin. - - See "Address Spaces on Linux for s/390 & z/Architecture" below - for more information about address space usage in Linux. - -18-19 18-19 Condition codes (CC) - -20 20 Fixed point overflow mask if 1=FPU exceptions for this event - occur ( normally 0 ) - -21 21 Decimal overflow mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -22 22 Exponent underflow mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -23 23 Significance Mask if 1=FPU exceptions for this event occur - ( normally 0 ) - -24-31 24-30 Reserved Must be 0. - - 31 Extended Addressing Mode - 32 Basic Addressing Mode - Used to set addressing mode - PSW 31 PSW 32 - 0 0 24 bit - 0 1 31 bit - 1 1 64 bit - -32 1=31 bit addressing mode 0=24 bit addressing mode (for backward - compatibility), linux always runs with this bit set to 1 - -33-64 Instruction address. - 33-63 Reserved must be 0 - 64-127 Address - In 24 bits mode bits 64-103=0 bits 104-127 Address - In 31 bits mode bits 64-96=0 bits 97-127 Address - Note: unlike 31 bit mode on s/390 bit 96 must be zero - when loading the address with LPSWE otherwise a - specification exception occurs, LPSW is fully backward - compatible. - ++-------------------------+-------------------------------------------------+ +| Bit | | ++--------+----------------+ Value | +| s/390 | z/Architecture | | ++========+================+=================================================+ +| 0 | 0 | Reserved (must be 0) otherwise specification | +| | | exception occurs. | ++--------+----------------+-------------------------------------------------+ +| 1 | 1 | Program Event Recording 1 PER enabled, | +| | | PER is used to facilitate debugging e.g. | +| | | single stepping. | ++--------+----------------+-------------------------------------------------+ +| 2-4 | 2-4 | Reserved (must be 0). | ++--------+----------------+-------------------------------------------------+ +| 5 | 5 | Dynamic address translation 1=DAT on. | ++--------+----------------+-------------------------------------------------+ +| 6 | 6 | Input/Output interrupt Mask | ++--------+----------------+-------------------------------------------------+ +| 7 | 7 | External interrupt Mask used primarily for | +| | | interprocessor signalling and clock interrupts. | ++--------+----------------+-------------------------------------------------+ +| 8-11 | 8-11 | PSW Key used for complex memory protection | +| | | mechanism (not used under linux) | ++--------+----------------+-------------------------------------------------+ +| 12 | 12 | 1 on s/390 0 on z/Architecture | ++--------+----------------+-------------------------------------------------+ +| 13 | 13 | Machine Check Mask 1=enable machine check | +| | | interrupts | ++--------+----------------+-------------------------------------------------+ +| 14 | 14 | Wait State. Set this to 1 to stop the processor | +| | | except for interrupts and give time to other | +| | | LPARS. Used in CPU idle in the kernel to | +| | | increase overall usage of processor resources. | ++--------+----------------+-------------------------------------------------+ +| 15 | 15 | Problem state (if set to 1 certain instructions | +| | | are disabled). All linux user programs run with | +| | | this bit 1 (useful info for debugging under VM).| ++--------+----------------+-------------------------------------------------+ +| 16-17 | 16-17 | Address Space Control | +| | | | +| | | 00 Primary Space Mode: | +| | | | +| | | The register CR1 contains the primary | +| | | address-space control element (PASCE), which | +| | | points to the primary space region/segment | +| | | table origin. | +| | | | +| | | 01 Access register mode | +| | | | +| | | 10 Secondary Space Mode: | +| | | | +| | | The register CR7 contains the secondary | +| | | address-space control element (SASCE), which | +| | | points to the secondary space region or | +| | | segment table origin. | +| | | | +| | | 11 Home Space Mode: | +| | | | +| | | The register CR13 contains the home space | +| | | address-space control element (HASCE), which | +| | | points to the home space region/segment | +| | | table origin. | +| | | | +| | | See "Address Spaces on Linux for s/390 & | +| | | z/Architecture" below for more information | +| | | about address space usage in Linux. | ++--------+----------------+-------------------------------------------------+ +| 18-19 | 18-19 | Condition codes (CC) | ++--------+----------------+-------------------------------------------------+ +| 20 | 20 | Fixed point overflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 21 | 21 | Decimal overflow mask if 1=FPU exceptions for | +| | | this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 22 | 22 | Exponent underflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 23 | 23 | Significance Mask if 1=FPU exceptions for this | +| | | event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 24-31 | 24-30 | Reserved Must be 0. | +| +----------------+-------------------------------------------------+ +| | 31 | Extended Addressing Mode | +| +----------------+-------------------------------------------------+ +| | 32 | Basic Addressing Mode | +| | | | +| | | Used to set addressing mode | +| | | | +| | | +---------+----------+----------+ | +| | | | PSW 31 | PSW 32 | | | +| | | +---------+----------+----------+ | +| | | | 0 | 0 | 24 bit | | +| | | +---------+----------+----------+ | +| | | | 0 | 1 | 31 bit | | +| | | +---------+----------+----------+ | +| | | | 1 | 1 | 64 bit | | +| | | +---------+----------+----------+ | ++--------+----------------+-------------------------------------------------+ +| 32 | | 1=31 bit addressing mode 0=24 bit addressing | +| | | mode (for backward compatibility), linux | +| | | always runs with this bit set to 1 | ++--------+----------------+-------------------------------------------------+ +| 33-64 | | Instruction address. | +| +----------------+-------------------------------------------------+ +| | 33-63 | Reserved must be 0 | +| +----------------+-------------------------------------------------+ +| | 64-127 | Address | +| | | | +| | | - In 24 bits mode bits 64-103=0 bits 104-127 | +| | | Address | +| | | - In 31 bits mode bits 64-96=0 bits 97-127 | +| | | Address | +| | | | +| | | Note: | +| | | unlike 31 bit mode on s/390 bit 96 must be | +| | | zero when loading the address with LPSWE | +| | | otherwise a specification exception occurs, | +| | | LPSW is fully backward compatible. | ++--------+----------------+-------------------------------------------------+ Prefix Page(s) -------------- -- cgit From 8b4a503d659b32cae8266aeb306f7fd6717e6a53 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:16 -0300 Subject: docs: s390: convert docs to ReST and rename to *.rst Convert all text files with s390 documentation to ReST format. Tried to preserve as much as possible the original document format. Still, some of the files required some work in order for it to be visible on both plain text and after converted to html. The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- Documentation/admin-guide/kernel-parameters.txt | 4 +- Documentation/driver-api/s390-drivers.rst | 4 +- Documentation/s390/3270.rst | 298 +++ Documentation/s390/3270.txt | 271 --- Documentation/s390/CommonIO | 125 -- Documentation/s390/DASD | 73 - Documentation/s390/Debugging390.txt | 2172 ------------------- Documentation/s390/cds.rst | 530 +++++ Documentation/s390/cds.txt | 472 ---- Documentation/s390/common_io.rst | 140 ++ Documentation/s390/dasd.rst | 84 + Documentation/s390/debugging390.rst | 2613 +++++++++++++++++++++++ Documentation/s390/driver-model.rst | 328 +++ Documentation/s390/driver-model.txt | 287 --- Documentation/s390/index.rst | 30 + Documentation/s390/monreader.rst | 212 ++ Documentation/s390/monreader.txt | 197 -- Documentation/s390/qeth.rst | 64 + Documentation/s390/qeth.txt | 50 - Documentation/s390/s390dbf.rst | 803 +++++++ Documentation/s390/s390dbf.txt | 667 ------ Documentation/s390/text_files.rst | 11 + Documentation/s390/vfio-ap.rst | 866 ++++++++ Documentation/s390/vfio-ap.txt | 837 -------- Documentation/s390/vfio-ccw.rst | 326 +++ Documentation/s390/vfio-ccw.txt | 300 --- Documentation/s390/zfcpdump.rst | 50 + Documentation/s390/zfcpdump.txt | 48 - 28 files changed, 6359 insertions(+), 5503 deletions(-) create mode 100644 Documentation/s390/3270.rst delete mode 100644 Documentation/s390/3270.txt delete mode 100644 Documentation/s390/CommonIO delete mode 100644 Documentation/s390/DASD delete mode 100644 Documentation/s390/Debugging390.txt create mode 100644 Documentation/s390/cds.rst delete mode 100644 Documentation/s390/cds.txt create mode 100644 Documentation/s390/common_io.rst create mode 100644 Documentation/s390/dasd.rst create mode 100644 Documentation/s390/debugging390.rst create mode 100644 Documentation/s390/driver-model.rst delete mode 100644 Documentation/s390/driver-model.txt create mode 100644 Documentation/s390/index.rst create mode 100644 Documentation/s390/monreader.rst delete mode 100644 Documentation/s390/monreader.txt create mode 100644 Documentation/s390/qeth.rst delete mode 100644 Documentation/s390/qeth.txt create mode 100644 Documentation/s390/s390dbf.rst delete mode 100644 Documentation/s390/s390dbf.txt create mode 100644 Documentation/s390/text_files.rst create mode 100644 Documentation/s390/vfio-ap.rst delete mode 100644 Documentation/s390/vfio-ap.txt create mode 100644 Documentation/s390/vfio-ccw.rst delete mode 100644 Documentation/s390/vfio-ccw.txt create mode 100644 Documentation/s390/zfcpdump.rst delete mode 100644 Documentation/s390/zfcpdump.txt (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b9b0623be925 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -478,7 +478,7 @@ others). ccw_timeout_log [S390] - See Documentation/s390/CommonIO for details. + See Documentation/s390/common_io.rst for details. cgroup_disable= [KNL] Disable a particular controller Format: {name of the controller(s) to disable} @@ -516,7 +516,7 @@ /selinux/checkreqprot. cio_ignore= [S390] - See Documentation/s390/CommonIO for details. + See Documentation/s390/common_io.rst for details. clk_ignore_unused [CLK] Prevents the clock framework from automatically gating diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst index 30e6aa7e160b..5158577bc29b 100644 --- a/Documentation/driver-api/s390-drivers.rst +++ b/Documentation/driver-api/s390-drivers.rst @@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well, although they are not the focus of this document. Some additional information can also be found in the kernel source under -Documentation/s390/driver-model.txt. +Documentation/s390/driver-model.rst. The css bus =========== @@ -38,7 +38,7 @@ into several categories: * Standard I/O subchannels, for use by the system. They have a child device on the ccw bus and are described below. * I/O subchannels bound to the vfio-ccw driver. See - Documentation/s390/vfio-ccw.txt. + Documentation/s390/vfio-ccw.rst. * Message subchannels. No Linux driver currently exists. * CHSC subchannels (at most one). The chsc subchannel driver can be used to send asynchronous chsc commands. diff --git a/Documentation/s390/3270.rst b/Documentation/s390/3270.rst new file mode 100644 index 000000000000..e09e77954238 --- /dev/null +++ b/Documentation/s390/3270.rst @@ -0,0 +1,298 @@ +=============================== +IBM 3270 Display System support +=============================== + +This file describes the driver that supports local channel attachment +of IBM 3270 devices. It consists of three sections: + + * Introduction + * Installation + * Operation + + +Introduction +============ + +This paper describes installing and operating 3270 devices under +Linux/390. A 3270 device is a block-mode rows-and-columns terminal of +which I'm sure hundreds of millions were sold by IBM and clonemakers +twenty and thirty years ago. + +You may have 3270s in-house and not know it. If you're using the +VM-ESA operating system, define a 3270 to your virtual machine by using +the command "DEF GRAF " This paper presumes you will be +defining four 3270s with the CP/CMS commands: + + - DEF GRAF 620 + - DEF GRAF 621 + - DEF GRAF 622 + - DEF GRAF 623 + +Your network connection from VM-ESA allows you to use x3270, tn3270, or +another 3270 emulator, started from an xterm window on your PC or +workstation. With the DEF GRAF command, an application such as xterm, +and this Linux-390 3270 driver, you have another way of talking to your +Linux box. + +This paper covers installation of the driver and operation of a +dialed-in x3270. + + +Installation +============ + +You install the driver by installing a patch, doing a kernel build, and +running the configuration script (config3270.sh, in this directory). + +WARNING: If you are using 3270 console support, you must rerun the +configuration script every time you change the console's address (perhaps +by using the condev= parameter in silo's /boot/parmfile). More precisely, +you should rerun the configuration script every time your set of 3270s, +including the console 3270, changes subchannel identifier relative to +one another. ReIPL as soon as possible after running the configuration +script and the resulting /tmp/mkdev3270. + +If you have chosen to make tub3270 a module, you add a line to a +configuration file under /etc/modprobe.d/. If you are working on a VM +virtual machine, you can use DEF GRAF to define virtual 3270 devices. + +You may generate both 3270 and 3215 console support, or one or the +other, or neither. If you generate both, the console type under VM is +not changed. Use #CP Q TERM to see what the current console type is. +Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only +3270 console support, then the driver automatically converts your console +at boot time to a 3270 if it is a 3215. + +In brief, these are the steps: + + 1. Install the tub3270 patch + 2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf` + 3. (If VM) define devices with DEF GRAF + 4. Reboot + 5. Configure + +To test that everything works, assuming VM and x3270, + + 1. Bring up an x3270 window. + 2. Use the DIAL command in that window. + 3. You should immediately see a Linux login screen. + +Here are the installation steps in detail: + + 1. The 3270 driver is a part of the official Linux kernel + source. Build a tree with the kernel source and any necessary + patches. Then do:: + + make oldconfig + (If you wish to disable 3215 console support, edit + .config; change CONFIG_TN3215's value to "n"; + and rerun "make oldconfig".) + make image + make modules + make modules_install + + 2. (Perform this step only if you have configured tub3270 as a + module.) Add a line to a file `/etc/modprobe.d/*.conf` to automatically + load the driver when it's needed. With this line added, you will see + login prompts appear on your 3270s as soon as boot is complete (or + with emulated 3270s, as soon as you dial into your vm guest using the + command "DIAL "). Since the line-mode major number is + 227, the line to add should be:: + + alias char-major-227 tub3270 + + 3. Define graphic devices to your vm guest machine, if you + haven't already. Define them before you reboot (reipl): + + - DEFINE GRAF 620 + - DEFINE GRAF 621 + - DEFINE GRAF 622 + - DEFINE GRAF 623 + + 4. Reboot. The reboot process scans hardware devices, including + 3270s, and this enables the tub3270 driver once loaded to respond + correctly to the configuration requests of the next step. If + you have chosen 3270 console support, your console now behaves + as a 3270, not a 3215. + + 5. Run the 3270 configuration script config3270. It is + distributed in this same directory, Documentation/s390, as + config3270.sh. Inspect the output script it produces, + /tmp/mkdev3270, and then run that script. This will create the + necessary character special device files and make the necessary + changes to /etc/inittab. + + Then notify /sbin/init that /etc/inittab has changed, by issuing + the telinit command with the q operand:: + + cd Documentation/s390 + sh config3270.sh + sh /tmp/mkdev3270 + telinit q + + This should be sufficient for your first time. If your 3270 + configuration has changed and you're reusing config3270, you + should follow these steps:: + + Change 3270 configuration + Reboot + Run config3270 and /tmp/mkdev3270 + Reboot + +Here are the testing steps in detail: + + 1. Bring up an x3270 window, or use an actual hardware 3278 or + 3279, or use the 3270 emulator of your choice. You would be + running the emulator on your PC or workstation. You would use + the command, for example:: + + x3270 vm-esa-domain-name & + + if you wanted a 3278 Model 4 with 43 rows of 80 columns, the + default model number. The driver does not take advantage of + extended attributes. + + The screen you should now see contains a VM logo with input + lines near the bottom. Use TAB to move to the bottom line, + probably labeled "COMMAND ===>". + + 2. Use the DIAL command instead of the LOGIN command to connect + to one of the virtual 3270s you defined with the DEF GRAF + commands:: + + dial my-vm-guest-name + + 3. You should immediately see a login prompt from your + Linux-390 operating system. If that does not happen, you would + see instead the line "DIALED TO my-vm-guest-name 0620". + + To troubleshoot: do these things. + + A. Is the driver loaded? Use the lsmod command (no operands) + to find out. Probably it isn't. Try loading it manually, with + the command "insmod tub3270". Does that command give error + messages? Ha! There's your problem. + + B. Is the /etc/inittab file modified as in installation step 3 + above? Use the grep command to find out; for instance, issue + "grep 3270 /etc/inittab". Nothing found? There's your + problem! + + C. Are the device special files created, as in installation + step 2 above? Use the ls -l command to find out; for instance, + issue "ls -l /dev/3270/tty620". The output should start with the + letter "c" meaning character device and should contain "227, 1" + just to the left of the device name. No such file? no "c"? + Wrong major number? Wrong minor number? There's your + problem! + + D. Do you get the message:: + + "HCPDIA047E my-vm-guest-name 0620 does not exist"? + + If so, you must issue the command "DEF GRAF 620" from your VM + 3215 console and then reboot the system. + + + +OPERATION. +========== + +The driver defines three areas on the 3270 screen: the log area, the +input area, and the status area. + +The log area takes up all but the bottom two lines of the screen. The +driver writes terminal output to it, starting at the top line and going +down. When it fills, the status area changes from "Linux Running" to +"Linux More...". After a scrolling timeout of (default) 5 sec, the +screen clears and more output is written, from the top down. + +The input area extends from the beginning of the second-to-last screen +line to the start of the status area. You type commands in this area +and hit ENTER to execute them. + +The status area initializes to "Linux Running" to give you a warm +fuzzy feeling. When the log area fills up and output awaits, it +changes to "Linux More...". At this time you can do several things or +nothing. If you do nothing, the screen will clear in (default) 5 sec +and more output will appear. You may hit ENTER with nothing typed in +the input area to toggle between "Linux More..." and "Linux Holding", +which indicates no scrolling will occur. (If you hit ENTER with "Linux +Running" and nothing typed, the application receives a newline.) + +You may change the scrolling timeout value. For example, the following +command line:: + + echo scrolltime=60 > /proc/tty/driver/tty3270 + +changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if +you wish to prevent scrolling entirely. + +Other things you may do when the log area fills up are: hit PA2 to +clear the log area and write more output to it, or hit CLEAR to clear +the log area and the input area and write more output to the log area. + +Some of the Program Function (PF) and Program Attention (PA) keys are +preassigned special functions. The ones that are not yield an alarm +when pressed. + +PA1 causes a SIGINT to the currently running application. You may do +the same thing from the input area, by typing "^C" and hitting ENTER. + +PA2 causes the log area to be cleared. If output awaits, it is then +written to the log area. + +PF3 causes an EOF to be received as input by the application. You may +cause an EOF also by typing "^D" and hitting ENTER. + +No PF key is preassigned to cause a job suspension, but you may cause a +job suspension by typing "^Z" and hitting ENTER. You may wish to +assign this function to a PF key. To make PF7 cause job suspension, +execute the command:: + + echo pf7=^z > /proc/tty/driver/tty3270 + +If the input you type does not end with the two characters "^n", the +driver appends a newline character and sends it to the tty driver; +otherwise the driver strips the "^n" and does not append a newline. +The IBM 3215 driver behaves similarly. + +Pf10 causes the most recent command to be retrieved from the tube's +command stack (default depth 20) and displayed in the input area. You +may hit PF10 again for the next-most-recent command, and so on. A +command is entered into the stack only when the input area is not made +invisible (such as for password entry) and it is not identical to the +current top entry. PF10 rotates backward through the command stack; +PF11 rotates forward. You may assign the backward function to any PF +key (or PA key, for that matter), say, PA3, with the command:: + + echo -e pa3=\\033k > /proc/tty/driver/tty3270 + +This assigns the string ESC-k to PA3. Similarly, the string ESC-j +performs the forward function. (Rationale: In bash with vi-mode line +editing, ESC-k and ESC-j retrieve backward and forward history. +Suggestions welcome.) + +Is a stack size of twenty commands not to your liking? Change it on +the fly. To change to saving the last 100 commands, execute the +command:: + + echo recallsize=100 > /proc/tty/driver/tty3270 + +Have a command you issue frequently? Assign it to a PF or PA key! Use +the command:: + + echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 + +to execute the commands mkdir foobar and cd foobar immediately when you +hit PF24. Want to see the command line first, before you execute it? +Use the -n option of the echo command:: + + echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270 + + + +Happy testing! I welcome any and all comments about this document, the +driver, etc etc. + +Dick Hitt diff --git a/Documentation/s390/3270.txt b/Documentation/s390/3270.txt deleted file mode 100644 index 7c715de99774..000000000000 --- a/Documentation/s390/3270.txt +++ /dev/null @@ -1,271 +0,0 @@ -IBM 3270 Display System support - -This file describes the driver that supports local channel attachment -of IBM 3270 devices. It consists of three sections: - * Introduction - * Installation - * Operation - - -INTRODUCTION. - -This paper describes installing and operating 3270 devices under -Linux/390. A 3270 device is a block-mode rows-and-columns terminal of -which I'm sure hundreds of millions were sold by IBM and clonemakers -twenty and thirty years ago. - -You may have 3270s in-house and not know it. If you're using the -VM-ESA operating system, define a 3270 to your virtual machine by using -the command "DEF GRAF " This paper presumes you will be -defining four 3270s with the CP/CMS commands - - DEF GRAF 620 - DEF GRAF 621 - DEF GRAF 622 - DEF GRAF 623 - -Your network connection from VM-ESA allows you to use x3270, tn3270, or -another 3270 emulator, started from an xterm window on your PC or -workstation. With the DEF GRAF command, an application such as xterm, -and this Linux-390 3270 driver, you have another way of talking to your -Linux box. - -This paper covers installation of the driver and operation of a -dialed-in x3270. - - -INSTALLATION. - -You install the driver by installing a patch, doing a kernel build, and -running the configuration script (config3270.sh, in this directory). - -WARNING: If you are using 3270 console support, you must rerun the -configuration script every time you change the console's address (perhaps -by using the condev= parameter in silo's /boot/parmfile). More precisely, -you should rerun the configuration script every time your set of 3270s, -including the console 3270, changes subchannel identifier relative to -one another. ReIPL as soon as possible after running the configuration -script and the resulting /tmp/mkdev3270. - -If you have chosen to make tub3270 a module, you add a line to a -configuration file under /etc/modprobe.d/. If you are working on a VM -virtual machine, you can use DEF GRAF to define virtual 3270 devices. - -You may generate both 3270 and 3215 console support, or one or the -other, or neither. If you generate both, the console type under VM is -not changed. Use #CP Q TERM to see what the current console type is. -Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only -3270 console support, then the driver automatically converts your console -at boot time to a 3270 if it is a 3215. - -In brief, these are the steps: - 1. Install the tub3270 patch - 2. (If a module) add a line to a file in /etc/modprobe.d/*.conf - 3. (If VM) define devices with DEF GRAF - 4. Reboot - 5. Configure - -To test that everything works, assuming VM and x3270, - 1. Bring up an x3270 window. - 2. Use the DIAL command in that window. - 3. You should immediately see a Linux login screen. - -Here are the installation steps in detail: - - 1. The 3270 driver is a part of the official Linux kernel - source. Build a tree with the kernel source and any necessary - patches. Then do - make oldconfig - (If you wish to disable 3215 console support, edit - .config; change CONFIG_TN3215's value to "n"; - and rerun "make oldconfig".) - make image - make modules - make modules_install - - 2. (Perform this step only if you have configured tub3270 as a - module.) Add a line to a file /etc/modprobe.d/*.conf to automatically - load the driver when it's needed. With this line added, you will see - login prompts appear on your 3270s as soon as boot is complete (or - with emulated 3270s, as soon as you dial into your vm guest using the - command "DIAL "). Since the line-mode major number is - 227, the line to add should be: - alias char-major-227 tub3270 - - 3. Define graphic devices to your vm guest machine, if you - haven't already. Define them before you reboot (reipl): - DEFINE GRAF 620 - DEFINE GRAF 621 - DEFINE GRAF 622 - DEFINE GRAF 623 - - 4. Reboot. The reboot process scans hardware devices, including - 3270s, and this enables the tub3270 driver once loaded to respond - correctly to the configuration requests of the next step. If - you have chosen 3270 console support, your console now behaves - as a 3270, not a 3215. - - 5. Run the 3270 configuration script config3270. It is - distributed in this same directory, Documentation/s390, as - config3270.sh. Inspect the output script it produces, - /tmp/mkdev3270, and then run that script. This will create the - necessary character special device files and make the necessary - changes to /etc/inittab. - - Then notify /sbin/init that /etc/inittab has changed, by issuing - the telinit command with the q operand: - cd Documentation/s390 - sh config3270.sh - sh /tmp/mkdev3270 - telinit q - - This should be sufficient for your first time. If your 3270 - configuration has changed and you're reusing config3270, you - should follow these steps: - Change 3270 configuration - Reboot - Run config3270 and /tmp/mkdev3270 - Reboot - -Here are the testing steps in detail: - - 1. Bring up an x3270 window, or use an actual hardware 3278 or - 3279, or use the 3270 emulator of your choice. You would be - running the emulator on your PC or workstation. You would use - the command, for example, - x3270 vm-esa-domain-name & - if you wanted a 3278 Model 4 with 43 rows of 80 columns, the - default model number. The driver does not take advantage of - extended attributes. - - The screen you should now see contains a VM logo with input - lines near the bottom. Use TAB to move to the bottom line, - probably labeled "COMMAND ===>". - - 2. Use the DIAL command instead of the LOGIN command to connect - to one of the virtual 3270s you defined with the DEF GRAF - commands: - dial my-vm-guest-name - - 3. You should immediately see a login prompt from your - Linux-390 operating system. If that does not happen, you would - see instead the line "DIALED TO my-vm-guest-name 0620". - - To troubleshoot: do these things. - - A. Is the driver loaded? Use the lsmod command (no operands) - to find out. Probably it isn't. Try loading it manually, with - the command "insmod tub3270". Does that command give error - messages? Ha! There's your problem. - - B. Is the /etc/inittab file modified as in installation step 3 - above? Use the grep command to find out; for instance, issue - "grep 3270 /etc/inittab". Nothing found? There's your - problem! - - C. Are the device special files created, as in installation - step 2 above? Use the ls -l command to find out; for instance, - issue "ls -l /dev/3270/tty620". The output should start with the - letter "c" meaning character device and should contain "227, 1" - just to the left of the device name. No such file? no "c"? - Wrong major number? Wrong minor number? There's your - problem! - - D. Do you get the message - "HCPDIA047E my-vm-guest-name 0620 does not exist"? - If so, you must issue the command "DEF GRAF 620" from your VM - 3215 console and then reboot the system. - - - -OPERATION. - -The driver defines three areas on the 3270 screen: the log area, the -input area, and the status area. - -The log area takes up all but the bottom two lines of the screen. The -driver writes terminal output to it, starting at the top line and going -down. When it fills, the status area changes from "Linux Running" to -"Linux More...". After a scrolling timeout of (default) 5 sec, the -screen clears and more output is written, from the top down. - -The input area extends from the beginning of the second-to-last screen -line to the start of the status area. You type commands in this area -and hit ENTER to execute them. - -The status area initializes to "Linux Running" to give you a warm -fuzzy feeling. When the log area fills up and output awaits, it -changes to "Linux More...". At this time you can do several things or -nothing. If you do nothing, the screen will clear in (default) 5 sec -and more output will appear. You may hit ENTER with nothing typed in -the input area to toggle between "Linux More..." and "Linux Holding", -which indicates no scrolling will occur. (If you hit ENTER with "Linux -Running" and nothing typed, the application receives a newline.) - -You may change the scrolling timeout value. For example, the following -command line: - echo scrolltime=60 > /proc/tty/driver/tty3270 -changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if -you wish to prevent scrolling entirely. - -Other things you may do when the log area fills up are: hit PA2 to -clear the log area and write more output to it, or hit CLEAR to clear -the log area and the input area and write more output to the log area. - -Some of the Program Function (PF) and Program Attention (PA) keys are -preassigned special functions. The ones that are not yield an alarm -when pressed. - -PA1 causes a SIGINT to the currently running application. You may do -the same thing from the input area, by typing "^C" and hitting ENTER. - -PA2 causes the log area to be cleared. If output awaits, it is then -written to the log area. - -PF3 causes an EOF to be received as input by the application. You may -cause an EOF also by typing "^D" and hitting ENTER. - -No PF key is preassigned to cause a job suspension, but you may cause a -job suspension by typing "^Z" and hitting ENTER. You may wish to -assign this function to a PF key. To make PF7 cause job suspension, -execute the command: - echo pf7=^z > /proc/tty/driver/tty3270 - -If the input you type does not end with the two characters "^n", the -driver appends a newline character and sends it to the tty driver; -otherwise the driver strips the "^n" and does not append a newline. -The IBM 3215 driver behaves similarly. - -Pf10 causes the most recent command to be retrieved from the tube's -command stack (default depth 20) and displayed in the input area. You -may hit PF10 again for the next-most-recent command, and so on. A -command is entered into the stack only when the input area is not made -invisible (such as for password entry) and it is not identical to the -current top entry. PF10 rotates backward through the command stack; -PF11 rotates forward. You may assign the backward function to any PF -key (or PA key, for that matter), say, PA3, with the command: - echo -e pa3=\\033k > /proc/tty/driver/tty3270 -This assigns the string ESC-k to PA3. Similarly, the string ESC-j -performs the forward function. (Rationale: In bash with vi-mode line -editing, ESC-k and ESC-j retrieve backward and forward history. -Suggestions welcome.) - -Is a stack size of twenty commands not to your liking? Change it on -the fly. To change to saving the last 100 commands, execute the -command: - echo recallsize=100 > /proc/tty/driver/tty3270 - -Have a command you issue frequently? Assign it to a PF or PA key! Use -the command - echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 -to execute the commands mkdir foobar and cd foobar immediately when you -hit PF24. Want to see the command line first, before you execute it? -Use the -n option of the echo command: - echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270 - - - -Happy testing! I welcome any and all comments about this document, the -driver, etc etc. - -Dick Hitt diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO deleted file mode 100644 index 6e0f63f343b4..000000000000 --- a/Documentation/s390/CommonIO +++ /dev/null @@ -1,125 +0,0 @@ -S/390 common I/O-Layer - command line parameters, procfs and debugfs entries -============================================================================ - -Command line parameters ------------------------ - -* ccw_timeout_log - - Enable logging of debug information in case of ccw device timeouts. - -* cio_ignore = device[,device[,..]] - - device := {all | [!]ipldev | [!]condev | [!] | [!]-} - - The given devices will be ignored by the common I/O-layer; no detection - and device sensing will be done on any of those devices. The subchannel to - which the device in question is attached will be treated as if no device was - attached. - - An ignored device can be un-ignored later; see the "/proc entries"-section for - details. - - The devices must be given either as bus ids (0.x.abcd) or as hexadecimal - device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you - give a device number 0xabcd, it will be interpreted as 0.0.abcd. - - You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev' - keywords can be used to refer to the CCW based boot device and CCW console - device respectively (these are probably useful only when combined with the '!' - operator). The '!' operator will cause the I/O-layer to _not_ ignore a device. - The command line is parsed from left to right. - - For example, - cio_ignore=0.0.0023-0.0.0042,0.0.4711 - will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device - 0.0.4711, if detected. - As another example, - cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02 - will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02. - - By default, no devices are ignored. - - -/proc entries -------------- - -* /proc/cio_ignore - - Lists the ranges of devices (by bus id) which are ignored by common I/O. - - You can un-ignore certain or all devices by piping to /proc/cio_ignore. - "free all" will un-ignore all ignored devices, - "free , , ..." will un-ignore the specified - devices. - - For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored, - - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore - will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023 - to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored; - - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device - 0.0.0041; - - echo free all > /proc/cio_ignore will un-ignore all remaining ignored - devices. - - When a device is un-ignored, device recognition and sensing is performed and - the device driver will be notified if possible, so the device will become - available to the system. Note that un-ignoring is performed asynchronously. - - You can also add ranges of devices to be ignored by piping to - /proc/cio_ignore; "add , , ..." will ignore the - specified devices. - - Note: While already known devices can be added to the list of devices to be - ignored, there will be no effect on then. However, if such a device - disappears and then reappears, it will then be ignored. To make - known devices go away, you need the "purge" command (see below). - - For example, - "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore" - will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored - devices. - - You can remove already known but now ignored devices via - "echo purge > /proc/cio_ignore" - All devices ignored but still registered and not online (= not in use) - will be deregistered and thus removed from the system. - - The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward - compatibility, by the device number in hexadecimal (0xabcd or abcd). Device - numbers given as 0xabcd will be interpreted as 0.0.abcd. - -* /proc/cio_settle - - A write request to this file is blocked until all queued cio actions are - handled. This will allow userspace to wait for pending work affecting - device availability after changing cio_ignore or the hardware configuration. - -* For some of the information present in the /proc filesystem in 2.4 (namely, - /proc/subchannels and /proc/chpids), see driver-model.txt. - Information formerly in /proc/irq_count is now in /proc/interrupts. - - -debugfs entries ---------------- - -* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature) - - Some views generated by the debug feature to hold various debug outputs. - - - /sys/kernel/debug/s390dbf/cio_crw/sprintf - Messages from the processing of pending channel report words (machine check - handling). - - - /sys/kernel/debug/s390dbf/cio_msg/sprintf - Various debug messages from the common I/O-layer. - - - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii - Logs the calling of functions in the common I/O-layer and, if applicable, - which subchannel they were called for, as well as dumps of some data - structures (like irb in an error case). - - The level of logging can be changed to be more or less verbose by piping to - /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the - documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt) - for details. diff --git a/Documentation/s390/DASD b/Documentation/s390/DASD deleted file mode 100644 index 9963f1e9c98a..000000000000 --- a/Documentation/s390/DASD +++ /dev/null @@ -1,73 +0,0 @@ -DASD device driver - -S/390's disk devices (DASDs) are managed by Linux via the DASD device -driver. It is valid for all types of DASDs and represents them to -Linux as block devices, namely "dd". Currently the DASD driver uses a -single major number (254) and 4 minor numbers per volume (1 for the -physical volume and 3 for partitions). With respect to partitions see -below. Thus you may have up to 64 DASD devices in your system. - -The kernel parameter 'dasd=from-to,...' may be issued arbitrary times -in the kernel's parameter line or not at all. The 'from' and 'to' -parameters are to be given in hexadecimal notation without a leading -0x. -If you supply kernel parameters the different instances are processed -in order of appearance and a minor number is reserved for any device -covered by the supplied range up to 64 volumes. Additional DASDs are -ignored. If you do not supply the 'dasd=' kernel parameter at all, the -DASD driver registers all supported DASDs of your system to a minor -number in ascending order of the subchannel number. - -The driver currently supports ECKD-devices and there are stubs for -support of the FBA and CKD architectures. For the FBA architecture -only some smart data structures are missing to make the support -complete. -We performed our testing on 3380 and 3390 type disks of different -sizes, under VM and on the bare hardware (LPAR), using internal disks -of the multiprise as well as a RAMAC virtual array. Disks exported by -an Enterprise Storage Server (Seascape) should work fine as well. - -We currently implement one partition per volume, which is the whole -volume, skipping the first blocks up to the volume label. These are -reserved for IPL records and IBM's volume label to assure -accessibility of the DASD from other OSs. In a later stage we will -provide support of partitions, maybe VTOC oriented or using a kind of -partition table in the label record. - -USAGE - --Low-level format (?CKD only) -For using an ECKD-DASD as a Linux harddisk you have to low-level -format the tracks by issuing the BLKDASDFORMAT-ioctl on that -device. This will erase any data on that volume including IBM volume -labels, VTOCs etc. The ioctl may take a 'struct format_data *' or -'NULL' as an argument. -typedef struct { - int start_unit; - int stop_unit; - int blksize; -} format_data_t; -When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole -disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit -and stop_unit are the first and last track to be formatted. If -stop_unit is -1 it implies that the DASD is formatted from start_unit -up to the last track. blksize can be any power of two between 512 and -4096. We recommend no blksize lower than 1024 because the ext2fs uses -1kB blocks anyway and you gain approx. 50% of capacity increasing your -blksize from 512 byte to 1kB. - --Make a filesystem -Then you can mk??fs the filesystem of your choice on that volume or -partition. For reasons of sanity you should build your filesystem on -the partition /dev/dd?1 instead of the whole volume. You only lose 3kB -but may be sure that you can reuse your data after introduction of a -real partition table. - -BUGS: -- Performance sometimes is rather low because we don't fully exploit clustering - -TODO-List: -- Add IBM'S Disk layout to genhd -- Enhance driver to use more than one major number -- Enable usage as a module -- Support Cache fast write and DASD fast write (ECKD) diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt deleted file mode 100644 index c35804c238ad..000000000000 --- a/Documentation/s390/Debugging390.txt +++ /dev/null @@ -1,2172 +0,0 @@ - - Debugging on Linux for s/390 & z/Architecture - by - Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation - Best viewed with fixed width fonts - -Overview of Document: -===================== -This document is intended to give a good overview of how to debug Linux for -s/390 and z/Architecture. It is not intended as a complete reference and not a -tutorial on the fundamentals of C & assembly. It doesn't go into -390 IO in any detail. It is intended to complement the documents in the -reference section below & any other worthwhile references you get. - -It is intended like the Enterprise Systems Architecture/390 Reference Summary -to be printed out & used as a quick cheat sheet self help style reference when -problems occur. - -Contents -======== -Register Set -Address Spaces on Intel Linux -Address Spaces on Linux for s/390 & z/Architecture -The Linux for s/390 & z/Architecture Kernel Task Structure -Register Usage & Stackframes on Linux for s/390 & z/Architecture -A sample program with comments -Compiling programs for debugging on Linux for s/390 & z/Architecture -Debugging under VM -s/390 & z/Architecture IO Overview -Debugging IO on s/390 & z/Architecture under VM -GDB on s/390 & z/Architecture -Stack chaining in gdb by hand -Examining core dumps -ldd -Debugging modules -The proc file system -SysRq -References -Special Thanks - -Register Set -============ -The current architectures have the following registers. - -16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture, -r0-r15 (or gpr0-gpr15), used for arithmetic and addressing. - -16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15, -kernel usage only, used for memory management, interrupt control, debugging -control etc. - -16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture, -normally not used by normal programs but potentially could be used as -temporary storage. These registers have a 1:1 association with general -purpose registers and are designed to be used in the so-called access -register mode to select different address spaces. -Access register 0 (and access register 1 on z/Architecture, which needs a -64 bit pointer) is currently used by the pthread library as a pointer to -the current running threads private area. - -16 64 bit floating point registers (fp0-fp15 ) IEEE & HFP floating -point format compliant on G5 upwards & a Floating point control reg (FPC) -4 64 bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines. -Note: -Linux (currently) always uses IEEE & emulates G5 IEEE format on older machines, -( provided the kernel is configured for this ). - - -The PSW is the most important register on the machine it -is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of -a program counter (pc), condition code register,memory space designator. -In IBM standard notation I am counting bit 0 as the MSB. -It has several advantages over a normal program counter -in that you can change address translation & program counter -in a single instruction. To change address translation, -e.g. switching address translation off requires that you -have a logical=physical mapping for the address you are -currently running at. - -+-------------------------+-------------------------------------------------+ -| Bit | | -+--------+----------------+ Value | -| s/390 | z/Architecture | | -+========+================+=================================================+ -| 0 | 0 | Reserved (must be 0) otherwise specification | -| | | exception occurs. | -+--------+----------------+-------------------------------------------------+ -| 1 | 1 | Program Event Recording 1 PER enabled, | -| | | PER is used to facilitate debugging e.g. | -| | | single stepping. | -+--------+----------------+-------------------------------------------------+ -| 2-4 | 2-4 | Reserved (must be 0). | -+--------+----------------+-------------------------------------------------+ -| 5 | 5 | Dynamic address translation 1=DAT on. | -+--------+----------------+-------------------------------------------------+ -| 6 | 6 | Input/Output interrupt Mask | -+--------+----------------+-------------------------------------------------+ -| 7 | 7 | External interrupt Mask used primarily for | -| | | interprocessor signalling and clock interrupts. | -+--------+----------------+-------------------------------------------------+ -| 8-11 | 8-11 | PSW Key used for complex memory protection | -| | | mechanism (not used under linux) | -+--------+----------------+-------------------------------------------------+ -| 12 | 12 | 1 on s/390 0 on z/Architecture | -+--------+----------------+-------------------------------------------------+ -| 13 | 13 | Machine Check Mask 1=enable machine check | -| | | interrupts | -+--------+----------------+-------------------------------------------------+ -| 14 | 14 | Wait State. Set this to 1 to stop the processor | -| | | except for interrupts and give time to other | -| | | LPARS. Used in CPU idle in the kernel to | -| | | increase overall usage of processor resources. | -+--------+----------------+-------------------------------------------------+ -| 15 | 15 | Problem state (if set to 1 certain instructions | -| | | are disabled). All linux user programs run with | -| | | this bit 1 (useful info for debugging under VM).| -+--------+----------------+-------------------------------------------------+ -| 16-17 | 16-17 | Address Space Control | -| | | | -| | | 00 Primary Space Mode: | -| | | | -| | | The register CR1 contains the primary | -| | | address-space control element (PASCE), which | -| | | points to the primary space region/segment | -| | | table origin. | -| | | | -| | | 01 Access register mode | -| | | | -| | | 10 Secondary Space Mode: | -| | | | -| | | The register CR7 contains the secondary | -| | | address-space control element (SASCE), which | -| | | points to the secondary space region or | -| | | segment table origin. | -| | | | -| | | 11 Home Space Mode: | -| | | | -| | | The register CR13 contains the home space | -| | | address-space control element (HASCE), which | -| | | points to the home space region/segment | -| | | table origin. | -| | | | -| | | See "Address Spaces on Linux for s/390 & | -| | | z/Architecture" below for more information | -| | | about address space usage in Linux. | -+--------+----------------+-------------------------------------------------+ -| 18-19 | 18-19 | Condition codes (CC) | -+--------+----------------+-------------------------------------------------+ -| 20 | 20 | Fixed point overflow mask if 1=FPU exceptions | -| | | for this event occur (normally 0) | -+--------+----------------+-------------------------------------------------+ -| 21 | 21 | Decimal overflow mask if 1=FPU exceptions for | -| | | this event occur (normally 0) | -+--------+----------------+-------------------------------------------------+ -| 22 | 22 | Exponent underflow mask if 1=FPU exceptions | -| | | for this event occur (normally 0) | -+--------+----------------+-------------------------------------------------+ -| 23 | 23 | Significance Mask if 1=FPU exceptions for this | -| | | event occur (normally 0) | -+--------+----------------+-------------------------------------------------+ -| 24-31 | 24-30 | Reserved Must be 0. | -| +----------------+-------------------------------------------------+ -| | 31 | Extended Addressing Mode | -| +----------------+-------------------------------------------------+ -| | 32 | Basic Addressing Mode | -| | | | -| | | Used to set addressing mode | -| | | | -| | | +---------+----------+----------+ | -| | | | PSW 31 | PSW 32 | | | -| | | +---------+----------+----------+ | -| | | | 0 | 0 | 24 bit | | -| | | +---------+----------+----------+ | -| | | | 0 | 1 | 31 bit | | -| | | +---------+----------+----------+ | -| | | | 1 | 1 | 64 bit | | -| | | +---------+----------+----------+ | -+--------+----------------+-------------------------------------------------+ -| 32 | | 1=31 bit addressing mode 0=24 bit addressing | -| | | mode (for backward compatibility), linux | -| | | always runs with this bit set to 1 | -+--------+----------------+-------------------------------------------------+ -| 33-64 | | Instruction address. | -| +----------------+-------------------------------------------------+ -| | 33-63 | Reserved must be 0 | -| +----------------+-------------------------------------------------+ -| | 64-127 | Address | -| | | | -| | | - In 24 bits mode bits 64-103=0 bits 104-127 | -| | | Address | -| | | - In 31 bits mode bits 64-96=0 bits 97-127 | -| | | Address | -| | | | -| | | Note: | -| | | unlike 31 bit mode on s/390 bit 96 must be | -| | | zero when loading the address with LPSWE | -| | | otherwise a specification exception occurs, | -| | | LPSW is fully backward compatible. | -+--------+----------------+-------------------------------------------------+ - -Prefix Page(s) --------------- -This per cpu memory area is too intimately tied to the processor not to mention. -It exists between the real addresses 0-4096 on s/390 and between 0-8192 on -z/Architecture and is exchanged with one page on s/390 or two pages on -z/Architecture in absolute storage by the set prefix instruction during Linux -startup. -This page is mapped to a different prefix for each processor in an SMP -configuration (assuming the OS designer is sane of course). -Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on -z/Architecture are used by the processor itself for holding such information -as exception indications and entry points for exceptions. -Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and -z/Architecture (there is a gap on z/Architecture currently between 0xc00 and -0x1000, too, which is used by Linux). -The closest thing to this on traditional architectures is the interrupt -vector table. This is a good thing & does simplify some of the kernel coding -however it means that we now cannot catch stray NULL pointers in the -kernel without hard coded checks. - - - -Address Spaces on Intel Linux -============================= - -The traditional Intel Linux is approximately mapped as follows forgive -the ascii art. -0xFFFFFFFF 4GB Himem ***************** - * * - * Kernel Space * - * * - ***************** **************** -User Space Himem * User Stack * * * -(typically 0xC0000000 3GB ) ***************** * * - * Shared Libs * * Next Process * - ***************** * to * - * * <== * Run * <== - * User Program * * * - * Data BSS * * * - * Text * * * - * Sections * * * -0x00000000 ***************** **************** - -Now it is easy to see that on Intel it is quite easy to recognise a kernel -address as being one greater than user space himem (in this case 0xC0000000), -and addresses of less than this are the ones in the current running program on -this processor (if an smp box). -If using the virtual machine ( VM ) as a debugger it is quite difficult to -know which user process is running as the address space you are looking at -could be from any process in the run queue. - -The limitation of Intels addressing technique is that the linux -kernel uses a very simple real address to virtual addressing technique -of Real Address=Virtual Address-User Space Himem. -This means that on Intel the kernel linux can typically only address -Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines -can typically use. -They can lower User Himem to 2GB or lower & thus be -able to use 2GB of RAM however this shrinks the maximum size -of User Space from 3GB to 2GB they have a no win limit of 4GB unless -they go to 64 Bit. - - -On 390 our limitations & strengths make us slightly different. -For backward compatibility we are only allowed use 31 bits (2GB) -of our 32 bit addresses, however, we use entirely separate address -spaces for the user & kernel. - -This means we can support 2GB of non Extended RAM on s/390, & more -with the Extended memory management swap device & -currently 4TB of physical memory currently on z/Architecture. - - -Address Spaces on Linux for s/390 & z/Architecture -================================================== - -Our addressing scheme is basically as follows: - - Primary Space Home Space -Himem 0x7fffffff 2GB on s/390 ***************** **************** -currently 0x3ffffffffff (2^42)-1 * User Stack * * * -on z/Architecture. ***************** * * - * Shared Libs * * * - ***************** * * - * * * Kernel * - * User Program * * * - * Data BSS * * * - * Text * * * - * Sections * * * -0x00000000 ***************** **************** - -This also means that we need to look at the PSW problem state bit and the -addressing mode to decide whether we are looking at user or kernel space. - -User space runs in primary address mode (or access register mode within -the vdso code). - -The kernel usually also runs in home space mode, however when accessing -user space the kernel switches to primary or secondary address mode if -the mvcos instruction is not available or if a compare-and-swap (futex) -instruction on a user space address is performed. - -When also looking at the ASCE control registers, this means: - -User space: -- runs in primary or access register mode -- cr1 contains the user asce -- cr7 contains the user asce -- cr13 contains the kernel asce - -Kernel space: -- runs in home space mode -- cr1 contains the user or kernel asce - -> the kernel asce is loaded when a uaccess requires primary or - secondary address mode -- cr7 contains the user or kernel asce, (changed with set_fs()) -- cr13 contains the kernel asce - -In case of uaccess the kernel changes to: -- primary space mode in case of a uaccess (copy_to_user) and uses - e.g. the mvcp instruction to access user space. However the kernel - will stay in home space mode if the mvcos instruction is available -- secondary space mode in case of futex atomic operations, so that the - instructions come from primary address space and data from secondary - space - -In case of KVM, the kernel runs in home space mode, but cr1 gets switched -to contain the gmap asce before the SIE instruction gets executed. When -the SIE instruction is finished, cr1 will be switched back to contain the -user asce. - - -Virtual Addresses on s/390 & z/Architecture -=========================================== - -A virtual address on s/390 is made up of 3 parts -The SX (segment index, roughly corresponding to the PGD & PMD in Linux -terminology) being bits 1-11. -The PX (page index, corresponding to the page table entry (pte) in Linux -terminology) being bits 12-19. -The remaining bits BX (the byte index are the offset in the page ) -i.e. bits 20 to 31. - -On z/Architecture in linux we currently make up an address from 4 parts. -The region index bits (RX) 0-32 we currently use bits 22-32 -The segment index (SX) being bits 33-43 -The page index (PX) being bits 44-51 -The byte index (BX) being bits 52-63 - -Notes: -1) s/390 has no PMD so the PMD is really the PGD also. -A lot of this stuff is defined in pgtable.h. - -2) Also seeing as s/390's page indexes are only 1k in size -(bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k ) -to make the best use of memory by updating 4 segment indices -entries each time we mess with a PMD & use offsets -0,1024,2048 & 3072 in this page as for our segment indexes. -On z/Architecture our page indexes are now 2k in size -( bits 12-19 x 8 bytes per pte ) we do a similar trick -but only mess with 2 segment indices each time we mess with -a PMD. - -3) As z/Architecture supports up to a massive 5-level page table lookup we -can only use 3 currently on Linux ( as this is all the generic kernel -currently supports ) however this may change in future -this allows us to access ( according to my sums ) -4TB of virtual storage per process i.e. -4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes, -enough for another 2 or 3 of years I think :-). -to do this we use a region-third-table designation type in -our address space control registers. - - -The Linux for s/390 & z/Architecture Kernel Task Structure -========================================================== -Each process/thread under Linux for S390 has its own kernel task_struct -defined in linux/include/linux/sched.h -The S390 on initialisation & resuming of a process on a cpu sets -the __LC_KERNEL_STACK variable in the spare prefix area for this cpu -(which we use for per-processor globals). - -The kernel stack pointer is intimately tied with the task structure for -each processor as follows. - - s/390 - ************************ - * 1 page kernel stack * - * ( 4K ) * - ************************ - * 1 page task_struct * - * ( 4K ) * -8K aligned ************************ - - z/Architecture - ************************ - * 2 page kernel stack * - * ( 8K ) * - ************************ - * 2 page task_struct * - * ( 8K ) * -16K aligned ************************ - -What this means is that we don't need to dedicate any register or global -variable to point to the current running process & can retrieve it with the -following very simple construct for s/390 & one very similar for z/Architecture. - -static inline struct task_struct * get_current(void) -{ - struct task_struct *current; - __asm__("lhi %0,-8192\n\t" - "nr %0,15" - : "=r" (current) ); - return current; -} - -i.e. just anding the current kernel stack pointer with the mask -8192. -Thankfully because Linux doesn't have support for nested IO interrupts -& our devices have large buffers can survive interrupts being shut for -short amounts of time we don't need a separate stack for interrupts. - - - - -Register Usage & Stackframes on Linux for s/390 & z/Architecture -================================================================= -Overview: ---------- -This is the code that gcc produces at the top & the bottom of -each function. It usually is fairly consistent & similar from -function to function & if you know its layout you can probably -make some headway in finding the ultimate cause of a problem -after a crash without a source level debugger. - -Note: To follow stackframes requires a knowledge of C or Pascal & -limited knowledge of one assembly language. - -It should be noted that there are some differences between the -s/390 and z/Architecture stack layouts as the z/Architecture stack layout -didn't have to maintain compatibility with older linkage formats. - -Glossary: ---------- -alloca: -This is a built in compiler function for runtime allocation -of extra space on the callers stack which is obviously freed -up on function exit ( e.g. the caller may choose to allocate nothing -of a buffer of 4k if required for temporary purposes ), it generates -very efficient code ( a few cycles ) when compared to alternatives -like malloc. - -automatics: These are local variables on the stack, -i.e they aren't in registers & they aren't static. - -back-chain: -This is a pointer to the stack pointer before entering a -framed functions ( see frameless function ) prologue got by -dereferencing the address of the current stack pointer, - i.e. got by accessing the 32 bit value at the stack pointers -current location. - -base-pointer: -This is a pointer to the back of the literal pool which -is an area just behind each procedure used to store constants -in each function. - -call-clobbered: The caller probably needs to save these registers if there -is something of value in them, on the stack or elsewhere before making a -call to another procedure so that it can restore it later. - -epilogue: -The code generated by the compiler to return to the caller. - -frameless-function -A frameless function in Linux for s390 & z/Architecture is one which doesn't -need more than the register save area (96 bytes on s/390, 160 on z/Architecture) -given to it by the caller. -A frameless function never: -1) Sets up a back chain. -2) Calls alloca. -3) Calls other normal functions -4) Has automatics. - -GOT-pointer: -This is a pointer to the global-offset-table in ELF -( Executable Linkable Format, Linux'es most common executable format ), -all globals & shared library objects are found using this pointer. - -lazy-binding -ELF shared libraries are typically only loaded when routines in the shared -library are actually first called at runtime. This is lazy binding. - -procedure-linkage-table -This is a table found from the GOT which contains pointers to routines -in other shared libraries which can't be called to by easier means. - -prologue: -The code generated by the compiler to set up the stack frame. - -outgoing-args: -This is extra area allocated on the stack of the calling function if the -parameters for the callee's cannot all be put in registers, the same -area can be reused by each function the caller calls. - -routine-descriptor: -A COFF executable format based concept of a procedure reference -actually being 8 bytes or more as opposed to a simple pointer to the routine. -This is typically defined as follows -Routine Descriptor offset 0=Pointer to Function -Routine Descriptor offset 4=Pointer to Table of Contents -The table of contents/TOC is roughly equivalent to a GOT pointer. -& it means that shared libraries etc. can be shared between several -environments each with their own TOC. - - -static-chain: This is used in nested functions a concept adopted from pascal -by gcc not used in ansi C or C++ ( although quite useful ), basically it -is a pointer used to reference local variables of enclosing functions. -You might come across this stuff once or twice in your lifetime. - -e.g. -The function below should return 11 though gcc may get upset & toss warnings -about unused variables. -int FunctionA(int a) -{ - int b; - FunctionC(int c) - { - b=c+1; - } - FunctionC(10); - return(b); -} - - -s/390 & z/Architecture Register usage -===================================== -r0 used by syscalls/assembly call-clobbered -r1 used by syscalls/assembly call-clobbered -r2 argument 0 / return value 0 call-clobbered -r3 argument 1 / return value 1 (if long long) call-clobbered -r4 argument 2 call-clobbered -r5 argument 3 call-clobbered -r6 argument 4 saved -r7 pointer-to arguments 5 to ... saved -r8 this & that saved -r9 this & that saved -r10 static-chain ( if nested function ) saved -r11 frame-pointer ( if function used alloca ) saved -r12 got-pointer saved -r13 base-pointer saved -r14 return-address saved -r15 stack-pointer saved - -f0 argument 0 / return value ( float/double ) call-clobbered -f2 argument 1 call-clobbered -f4 z/Architecture argument 2 saved -f6 z/Architecture argument 3 saved -The remaining floating points -f1,f3,f5 f7-f15 are call-clobbered. - -Notes: ------- -1) The only requirement is that registers which are used -by the callee are saved, e.g. the compiler is perfectly -capable of using r11 for purposes other than a frame a -frame pointer if a frame pointer is not needed. -2) In functions with variable arguments e.g. printf the calling procedure -is identical to one without variable arguments & the same number of -parameters. However, the prologue of this function is somewhat more -hairy owing to it having to move these parameters to the stack to -get va_start, va_arg & va_end to work. -3) Access registers are currently unused by gcc but are used in -the kernel. Possibilities exist to use them at the moment for -temporary storage but it isn't recommended. -4) Only 4 of the floating point registers are used for -parameter passing as older machines such as G3 only have only 4 -& it keeps the stack frame compatible with other compilers. -However with IEEE floating point emulation under linux on the -older machines you are free to use the other 12. -5) A long long or double parameter cannot be have the -first 4 bytes in a register & the second four bytes in the -outgoing args area. It must be purely in the outgoing args -area if crossing this boundary. -6) Floating point parameters are mixed with outgoing args -on the outgoing args area in the order the are passed in as parameters. -7) Floating point arguments 2 & 3 are saved in the outgoing args area for -z/Architecture - - -Stack Frame Layout ------------------- -s/390 z/Architecture -0 0 back chain ( a 0 here signifies end of back chain ) -4 8 eos ( end of stack, not used on Linux for S390 used in other linkage formats ) -8 16 glue used in other s/390 linkage formats for saved routine descriptors etc. -12 24 glue used in other s/390 linkage formats for saved routine descriptors etc. -16 32 scratch area -20 40 scratch area -24 48 saved r6 of caller function -28 56 saved r7 of caller function -32 64 saved r8 of caller function -36 72 saved r9 of caller function -40 80 saved r10 of caller function -44 88 saved r11 of caller function -48 96 saved r12 of caller function -52 104 saved r13 of caller function -56 112 saved r14 of caller function -60 120 saved r15 of caller function -64 128 saved f4 of caller function -72 132 saved f6 of caller function -80 undefined -96 160 outgoing args passed from caller to callee -96+x 160+x possible stack alignment ( 8 bytes desirable ) -96+x+y 160+x+y alloca space of caller ( if used ) -96+x+y+z 160+x+y+z automatics of caller ( if used ) -0 back-chain - -A sample program with comments. -=============================== - -Comments on the function test ------------------------------ -1) It didn't need to set up a pointer to the constant pool gpr13 as it is not -used ( :-( ). -2) This is a frameless function & no stack is bought. -3) The compiler was clever enough to recognise that it could return the -value in r2 as well as use it for the passed in parameter ( :-) ). -4) The basr ( branch relative & save ) trick works as follows the instruction -has a special case with r0,r0 with some instruction operands is understood as -the literal value 0, some risc architectures also do this ). So now -we are branching to the next address & the address new program counter is -in r13,so now we subtract the size of the function prologue we have executed -+ the size of the literal pool to get to the top of the literal pool -0040037c int test(int b) -{ # Function prologue below - 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14 - 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using - 400382: a7 da ff fa ahi %r13,-6 # basr trick - return(5+b); - # Huge main program - 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2 - - # Function epilogue below - 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14 - 40038e: 07 fe br %r14 # return -} - -Comments on the function main ------------------------------ -1) The compiler did this function optimally ( 8-) ) - -Literal pool for main. -400390: ff ff ff ec .long 0xffffffec -main(int argc,char *argv[]) -{ # Function prologue below - 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers - 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0 - 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving - 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to - 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool - 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain - - return(test(5)); # Main Program Below - 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from - # literal pool - 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5 - 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return - # address using branch & save instruction. - - # Function Epilogue below - 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers. - 4003b8: 07 fe br %r14 # return to do program exit -} - - -Compiler updates ----------------- - -main(int argc,char *argv[]) -{ - 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15) - 400500: a7 d5 00 04 bras %r13,400508 - 400504: 00 40 04 f4 .long 0x004004f4 - # compiler now puts constant pool in code to so it saves an instruction - 400508: 18 0f lr %r0,%r15 - 40050a: a7 fa ff a0 ahi %r15,-96 - 40050e: 50 00 f0 00 st %r0,0(%r15) - return(test(5)); - 400512: 58 10 d0 00 l %r1,0(%r13) - 400516: a7 28 00 05 lhi %r2,5 - 40051a: 0d e1 basr %r14,%r1 - # compiler adds 1 extra instruction to epilogue this is done to - # avoid processor pipeline stalls owing to data dependencies on g5 & - # above as register 14 in the old code was needed directly after being loaded - # by the lm %r11,%r15,140(%r15) for the br %14. - 40051c: 58 40 f0 98 l %r4,152(%r15) - 400520: 98 7f f0 7c lm %r7,%r15,124(%r15) - 400524: 07 f4 br %r4 -} - - -Hartmut ( our compiler developer ) also has been threatening to take out the -stack backchain in optimised code as this also causes pipeline stalls, you -have been warned. - -64 bit z/Architecture code disassembly --------------------------------------- - -If you understand the stuff above you'll understand the stuff -below too so I'll avoid repeating myself & just say that -some of the instructions have g's on the end of them to indicate -they are 64 bit & the stack offsets are a bigger, -the only other difference you'll find between 32 & 64 bit is that -we now use f4 & f6 for floating point arguments on 64 bit. -00000000800005b0 : -int test(int b) -{ - return(5+b); - 800005b0: a7 2a 00 05 ahi %r2,5 - 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer - 800005b8: 07 fe br %r14 - 800005ba: 07 07 bcr 0,%r7 - - -} - -00000000800005bc
: -main(int argc,char *argv[]) -{ - 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15) - 800005c2: b9 04 00 1f lgr %r1,%r15 - 800005c6: a7 fb ff 60 aghi %r15,-160 - 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15) - return(test(5)); - 800005d0: a7 29 00 05 lghi %r2,5 - # brasl allows jumps > 64k & is overkill here bras would do fune - 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 - 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15) - 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15) - 800005e6: 07 f4 br %r4 -} - - - -Compiling programs for debugging on Linux for s/390 & z/Architecture -==================================================================== --gdwarf-2 now works it should be considered the default debugging -format for s/390 & z/Architecture as it is more reliable for debugging -shared libraries, normal -g debugging works much better now -Thanks to the IBM java compiler developers bug reports. - -This is typically done adding/appending the flags -g or -gdwarf-2 to the -CFLAGS & LDFLAGS variables Makefile of the program concerned. - -If using gdb & you would like accurate displays of registers & - stack traces compile without optimisation i.e make sure -that there is no -O2 or similar on the CFLAGS line of the Makefile & -the emitted gcc commands, obviously this will produce worse code -( not advisable for shipment ) but it is an aid to the debugging process. - -This aids debugging because the compiler will copy parameters passed in -in registers onto the stack so backtracing & looking at passed in -parameters will work, however some larger programs which use inline functions -will not compile without optimisation. - -Debugging with optimisation has since much improved after fixing -some bugs, please make sure you are using gdb-5.0 or later developed -after Nov'2000. - - - -Debugging under VM -================== - -Notes ------ -Addresses & values in the VM debugger are always hex never decimal -Address ranges are of the format - or -. -For example, the address range 0x2000 to 0x3000 can be described as 2000-3000 -or 2000.1000 - -The VM Debugger is case insensitive. - -VM's strengths are usually other debuggers weaknesses you can get at any -resource no matter how sensitive e.g. memory management resources, change -address translation in the PSW. For kernel hacking you will reap dividends if -you get good at it. - -The VM Debugger displays operators but not operands, and also the debugger -displays useful information on the same line as the author of the code probably -felt that it was a good idea not to go over the 80 columns on the screen. -This isn't as unintuitive as it may seem as the s/390 instructions are easy to -decode mentally and you can make a good guess at a lot of them as all the -operands are nibble (half byte aligned). -So if you have an objdump listing by hand, it is quite easy to follow, and if -you don't have an objdump listing keep a copy of the s/390 Reference Summary -or alternatively the s/390 principles of operation next to you. -e.g. even I can guess that -0001AFF8' LR 180F CC 0 -is a ( load register ) lr r0,r15 - -Also it is very easy to tell the length of a 390 instruction from the 2 most -significant bits in the instruction (not that this info is really useful except -if you are trying to make sense of a hexdump of code). -Here is a table -Bits Instruction Length ------------------------------------------- -00 2 Bytes -01 4 Bytes -10 4 Bytes -11 6 Bytes - -The debugger also displays other useful info on the same line such as the -addresses being operated on destination addresses of branches & condition codes. -e.g. -00019736' AHI A7DAFF0E CC 1 -000198BA' BRC A7840004 -> 000198C2' CC 0 -000198CE' STM 900EF068 >> 0FA95E78 CC 2 - - - -Useful VM debugger commands ---------------------------- - -I suppose I'd better mention this before I start -to list the current active traces do -Q TR -there can be a maximum of 255 of these per set -( more about trace sets later ). -To stop traces issue a -TR END. -To delete a particular breakpoint issue -TR DEL - -The PA1 key drops to CP mode so you can issue debugger commands, -Doing alt c (on my 3270 console at least ) clears the screen. -hitting b comes back to the running operating system -from cp mode ( in our case linux ). -It is typically useful to add shortcuts to your profile.exec file -if you have one ( this is roughly equivalent to autoexec.bat in DOS ). -file here are a few from mine. -/* this gives me command history on issuing f12 */ -set pf12 retrieve -/* this continues */ -set pf8 imm b -/* goes to trace set a */ -set pf1 imm tr goto a -/* goes to trace set b */ -set pf2 imm tr goto b -/* goes to trace set c */ -set pf3 imm tr goto c - - - -Instruction Tracing -------------------- -Setting a simple breakpoint -TR I PSWA
-To debug a particular function try -TR I R -TR I on its own will single step. -TR I DATA will trace for particular mnemonics -e.g. -TR I DATA 4D R 0197BC.4000 -will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000 -if you were inclined you could add traces for all branch instructions & -suffix them with the run prefix so you would have a backtrace on screen -when a program crashes. -TR BR will trace branches into or out of an address. -e.g. -TR BR INTO 0 is often quite useful if a program is getting awkward & deciding -to branch to 0 & crashing as this will stop at the address before in jumps to 0. -TR I R
RUN cmd d g -single steps a range of addresses but stays running & -displays the gprs on each step. - - - -Displaying & modifying Registers --------------------------------- -D G will display all the gprs -Adding a extra G to all the commands is necessary to access the full 64 bit -content in VM on z/Architecture. Obviously this isn't required for access -registers as these are still 32 bit. -e.g. DGG instead of DG -D X will display all the control registers -D AR will display all the access registers -D AR4-7 will display access registers 4 to 7 -CPU ALL D G will display the GRPS of all CPUS in the configuration -D PSW will display the current PSW -st PSW 2000 will put the value 2000 into the PSW & -cause crash your machine. -D PREFIX displays the prefix offset - - -Displaying Memory ------------------ -To display memory mapped using the current PSW's mapping try -D -To make VM display a message each time it hits a particular address and -continue try -D I will disassemble/display a range of instructions. -ST addr 32 bit word will store a 32 bit aligned address -D T will display the EBCDIC in an address (if you are that way inclined) -D R will display real addresses ( without DAT ) but with prefixing. -There are other complex options to display if you need to get at say home space -but are in primary space the easiest thing to do is to temporarily -modify the PSW to the other addressing mode, display the stuff & then -restore it. - - - -Hints ------ -If you want to issue a debugger command without halting your virtual machine -with the PA1 key try prefixing the command with #CP e.g. -#cp tr i pswa 2000 -also suffixing most debugger commands with RUN will cause them not -to stop just display the mnemonic at the current instruction on the console. -If you have several breakpoints you want to put into your program & -you get fed up of cross referencing with System.map -you can do the following trick for several symbols. -grep do_signal System.map -which emits the following among other things -0001f4e0 T do_signal -now you can do - -TR I PSWA 0001f4e0 cmd msg * do_signal -This sends a message to your own console each time do_signal is entered. -( As an aside I wrote a perl script once which automatically generated a REXX -script with breakpoints on every kernel procedure, this isn't a good idea -because there are thousands of these routines & VM can only set 255 breakpoints -at a time so you nearly had to spend as long pruning the file down as you would -entering the msgs by hand), however, the trick might be useful for a single -object file. In the 3270 terminal emulator x3270 there is a very useful option -in the file menu called "Save Screen In File" - this is very good for keeping a -copy of traces. - -From CMS help will give you online help on a particular command. -e.g. -HELP DISPLAY - -Also CP has a file called profile.exec which automatically gets called -on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session -CP has a feature similar to doskey, it may be useful for you to -use profile.exec to define some keystrokes. -e.g. -SET PF9 IMM B -This does a single step in VM on pressing F8. -SET PF10 ^ -This sets up the ^ key. -which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed directly -into some 3270 consoles. -SET PF11 ^- -This types the starting keystrokes for a sysrq see SysRq below. -SET PF12 RETRIEVE -This retrieves command history on pressing F12. - - -Sometimes in VM the display is set up to scroll automatically this -can be very annoying if there are messages you wish to look at -to stop this do -TERM MORE 255 255 -This will nearly stop automatic screen updates, however it will -cause a denial of service if lots of messages go to the 3270 console, -so it would be foolish to use this as the default on a production machine. - - -Tracing particular processes ----------------------------- -The kernel's text segment is intentionally at an address in memory that it will -very seldom collide with text segments of user programs ( thanks Martin ), -this simplifies debugging the kernel. -However it is quite common for user processes to have addresses which collide -this can make debugging a particular process under VM painful under normal -circumstances as the process may change when doing a -TR I R
. -Thankfully after reading VM's online help I figured out how to debug -I particular process. - -Your first problem is to find the STD ( segment table designation ) -of the program you wish to debug. -There are several ways you can do this here are a few -1) objdump --syms | grep main -To get the address of main in the program. -tr i pswa
-Start the program, if VM drops to CP on what looks like the entry -point of the main function this is most likely the process you wish to debug. -Now do a D X13 or D XG13 on z/Architecture. -On 31 bit the STD is bits 1-19 ( the STO segment table origin ) -& 25-31 ( the STL segment table length ) of CR13. -now type -TR I R STD 0.7fffffff -e.g. -TR I R STD 8F32E1FF 0.7fffffff -Another very useful variation is -TR STORE INTO STD
-for finding out when a particular variable changes. - -An alternative way of finding the STD of a currently running process -is to do the following, ( this method is more complex but -could be quite convenient if you aren't updating the kernel much & -so your kernel structures will stay constant for a reasonable period of -time ). - -grep task /proc//status -from this you should see something like -task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68 -This now gives you a pointer to the task structure. -Now make CC:="s390-gcc -g" kernel/sched.s -To get the task_struct stabinfo. -( task_struct is defined in include/linux/sched.h ). -Now we want to look at -task->active_mm->pgd -on my machine the active_mm in the task structure stab is -active_mm:(4,12),672,32 -its offset is 672/8=84=0x54 -the pgd member in the mm_struct stab is -pgd:(4,6)=*(29,5),96,32 -so its offset is 96/8=12=0xc - -so we'll -hexdump -s 0xf160054 /dev/mem | more -i.e. task_struct+active_mm offset -to look at the active_mm member -f160054 0fee cc60 0019 e334 0000 0000 0000 0011 -hexdump -s 0x0feecc6c /dev/mem | more -i.e. active_mm+pgd offset -feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010 -we get something like -now do -TR I R STD 0.7fffffff -i.e. the 0x7f is added because the pgd only -gives the page table origin & we need to set the low bits -to the maximum possible segment table length. -TR I R STD 0f2c007f 0.7fffffff -on z/Architecture you'll probably need to do -TR I R STD 0.ffffffffffffffff -to set the TableType to 0x1 & the Table length to 3. - - - -Tracing Program Exceptions --------------------------- -If you get a crash which says something like -illegal operation or specification exception followed by a register dump -You can restart linux & trace these using the tr prog trace -option. - - -The most common ones you will normally be tracing for is -1=operation exception -2=privileged operation exception -4=protection exception -5=addressing exception -6=specification exception -10=segment translation exception -11=page translation exception - -The full list of these is on page 22 of the current s/390 Reference Summary. -e.g. -tr prog 10 will trace segment translation exceptions. -tr prog on its own will trace all program interruption codes. - -Trace Sets ----------- -On starting VM you are initially in the INITIAL trace set. -You can do a Q TR to verify this. -If you have a complex tracing situation where you wish to wait for instance -till a driver is open before you start tracing IO, but know in your -heart that you are going to have to make several runs through the code till you -have a clue whats going on. - -What you can do is -TR I PSWA -hit b to continue till breakpoint -reach the breakpoint -now do your -TR GOTO B -TR IO 7c08-7c09 inst int run -or whatever the IO channels you wish to trace are & hit b - -To got back to the initial trace set do -TR GOTO INITIAL -& the TR I PSWA will be the only active breakpoint again. - - -Tracing linux syscalls under VM -------------------------------- -Syscalls are implemented on Linux for S390 by the Supervisor call instruction -(SVC). There 256 possibilities of these as the instruction is made up of a 0xA -opcode and the second byte being the syscall number. They are traced using the -simple command: -TR SVC -the syscalls are defined in linux/arch/s390/include/asm/unistd.h -e.g. to trace all file opens just do -TR SVC 5 ( as this is the syscall number of open ) - - -SMP Specific commands ---------------------- -To find out how many cpus you have -Q CPUS displays all the CPU's available to your virtual machine -To find the cpu that the current cpu VM debugger commands are being directed at -do Q CPU to change the current cpu VM debugger commands are being directed at do -CPU - -On a SMP guest issue a command to all CPUs try prefixing the command with cpu -all. To issue a command to a particular cpu try cpu e.g. -CPU 01 TR I R 2000.3000 -If you are running on a guest with several cpus & you have a IO related problem -& cannot follow the flow of code but you know it isn't smp related. -from the bash prompt issue -shutdown -h now or halt. -do a Q CPUS to find out how many cpus you have -detach each one of them from cp except cpu 0 -by issuing a -DETACH CPU 01-(number of cpus in configuration) -& boot linux again. -TR SIGP will trace inter processor signal processor instructions. -DEFINE CPU 01-(number in configuration) -will get your guests cpus back. - - -Help for displaying ascii textstrings -------------------------------------- -On the very latest VM Nucleus'es VM can now display ascii -( thanks Neale for the hint ) by doing -D TX. -e.g. -D TX0.100 - -Alternatively -============= -Under older VM debuggers (I love EBDIC too) you can use following little -program which converts a command line of hex digits to ascii text. It can be -compiled under linux and you can copy the hex digits from your x3270 terminal -to your xterm if you are debugging from a linuxbox. - -This is quite useful when looking at a parameter passed in as a text string -under VM ( unless you are good at decoding ASCII in your head ). - -e.g. consider tracing an open syscall -TR SVC 5 -We have stopped at a breakpoint -000151B0' SVC 0A05 -> 0001909A' CC 0 - -D 20.8 to check the SVC old psw in the prefix area and see was it from userspace -(for the layout of the prefix area consult the "Fixed Storage Locations" -chapter of the s/390 Reference Summary if you have it available). -V00000020 070C2000 800151B2 -The problem state bit wasn't set & it's also too early in the boot sequence -for it to be a userspace SVC if it was we would have to temporarily switch the -psw to user space addressing so we could get at the first parameter of the open -in gpr2. -Next do a -D G2 -GPR 2 = 00014CB4 -Now display what gpr2 is pointing to -D 00014CB4.20 -V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5 -V00014CC4 FC00014C B4001001 E0001000 B8070707 -Now copy the text till the first 00 hex ( which is the end of the string -to an xterm & do hex2ascii on it. -hex2ascii 2F646576 2F636F6E 736F6C65 00 -outputs -Decoded Hex:=/ d e v / c o n s o l e 0x00 -We were opening the console device, - -You can compile the code below yourself for practice :-), -/* - * hex2ascii.c - * a useful little tool for converting a hexadecimal command line to ascii - * - * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation. - */ -#include - -int main(int argc,char *argv[]) -{ - int cnt1,cnt2,len,toggle=0; - int startcnt=1; - unsigned char c,hex; - - if(argc>1&&(strcmp(argv[1],"-a")==0)) - startcnt=2; - printf("Decoded Hex:="); - for(cnt1=startcnt;cnt1='0'&&c<='9') - c=c-'0'; - if(c>='A'&&c<='F') - c=c-'A'+10; - if(c>='a'&&c<='f') - c=c-'a'+10; - switch(toggle) - { - case 0: - hex=c<<4; - toggle=1; - break; - case 1: - hex+=c; - if(hex<32||hex>127) - { - if(startcnt==1) - printf("0x%02X ",(int)hex); - else - printf("."); - } - else - { - printf("%c",hex); - if(startcnt==1) - printf(" "); - } - toggle=0; - break; - } - } - } - printf("\n"); -} - - - - -Stack tracing under VM ----------------------- -A basic backtrace ------------------ - -Here are the tricks I use 9 out of 10 times it works pretty well, - -When your backchain reaches a dead end --------------------------------------- -This can happen when an exception happens in the kernel and the kernel is -entered twice. If you reach the NULL pointer at the end of the back chain you -should be able to sniff further back if you follow the following tricks. -1) A kernel address should be easy to recognise since it is in -primary space & the problem state bit isn't set & also -The Hi bit of the address is set. -2) Another backchain should also be easy to recognise since it is an -address pointing to another address approximately 100 bytes or 0x70 hex -behind the current stackpointer. - - -Here is some practice. -boot the kernel & hit PA1 at some random time -d g to display the gprs, this should display something like -GPR 0 = 00000001 00156018 0014359C 00000000 -GPR 4 = 00000001 001B8888 000003E0 00000000 -GPR 8 = 00100080 00100084 00000000 000FE000 -GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8 -Note that GPR14 is a return address but as we are real men we are going to -trace the stack. -display 0x40 bytes after the stack pointer. - -V000FFED8 000FFF38 8001B838 80014C8E 000FFF38 -V000FFEE8 00000000 00000000 000003E0 00000000 -V000FFEF8 00100080 00100084 00000000 000FE000 -V000FFF08 00010400 8001B2DC 8001B36A 000FFED8 - - -Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if -you look above at our stackframe & also agrees with GPR14. - -now backchain -d 000FFF38.40 -we now are taking the contents of SP to get our first backchain. - -V000FFF38 000FFFA0 00000000 00014995 00147094 -V000FFF48 00147090 001470A0 000003E0 00000000 -V000FFF58 00100080 00100084 00000000 001BF1D0 -V000FFF68 00010400 800149BA 80014CA6 000FFF38 - -This displays a 2nd return address of 80014CA6 - -now do d 000FFFA0.40 for our 3rd backchain - -V000FFFA0 04B52002 0001107F 00000000 00000000 -V000FFFB0 00000000 00000000 FF000000 0001107F -V000FFFC0 00000000 00000000 00000000 00000000 -V000FFFD0 00010400 80010802 8001085A 000FFFA0 - - -our 3rd return address is 8001085A - -as the 04B52002 looks suspiciously like rubbish it is fair to assume that the -kernel entry routines for the sake of optimisation don't set up a backchain. - -now look at System.map to see if the addresses make any sense. - -grep -i 0001b3 System.map -outputs among other things -0001b304 T cpu_idle -so 8001B36A -is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it ) - - -grep -i 00014 System.map -produces among other things -00014a78 T start_kernel -so 0014CA6 is start_kernel+some hex number I can't add in my head. - -grep -i 00108 System.map -this produces -00010800 T _stext -so 8001085A is _stext+0x5a - -Congrats you've done your first backchain. - - - -s/390 & z/Architecture IO Overview -================================== - -I am not going to give a course in 390 IO architecture as this would take me -quite a while and I'm no expert. Instead I'll give a 390 IO architecture -summary for Dummies. If you have the s/390 principles of operation available -read this instead. If nothing else you may find a few useful keywords in here -and be able to use them on a web search engine to find more useful information. - -Unlike other bus architectures modern 390 systems do their IO using mostly -fibre optics and devices such as tapes and disks can be shared between several -mainframes. Also S390 can support up to 65536 devices while a high end PC based -system might be choking with around 64. - -Here is some of the common IO terminology: - -Subchannel: -This is the logical number most IO commands use to talk to an IO device. There -can be up to 0x10000 (65536) of these in a configuration, typically there are a -few hundred. Under VM for simplicity they are allocated contiguously, however -on the native hardware they are not. They typically stay consistent between -boots provided no new hardware is inserted or removed. -Under Linux for s390 we use these as IRQ's and also when issuing an IO command -(CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL, -START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID -of the device we wish to talk to. The most important of these instructions are -START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO -completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have -up to 8 channel paths to a device, this offers redundancy if one is not -available. - -Device Number: -This number remains static and is closely tied to the hardware. There are 65536 -of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and -another lsb 8 bits. These remain static even if more devices are inserted or -removed from the hardware. There is a 1 to 1 mapping between subchannels and -device numbers, provided devices aren't inserted or removed. - -Channel Control Words: -CCWs are linked lists of instructions initially pointed to by an operation -request block (ORB), which is initially given to Start Subchannel (SSCH) -command along with the subchannel number for the IO subsystem to process -while the CPU continues executing normal code. -CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and -Format 1 (31 bit). These are typically used to issue read and write (and many -other) instructions. They consist of a length field and an absolute address -field. -Each IO typically gets 1 or 2 interrupts, one for channel end (primary status) -when the channel is idle, and the second for device end (secondary status). -Sometimes you get both concurrently. You check how the IO went on by issuing a -TEST SUBCHANNEL at each interrupt, from which you receive an Interruption -response block (IRB). If you get channel and device end status in the IRB -without channel checks etc. your IO probably went okay. If you didn't you -probably need to examine the IRB, extended status word etc. -If an error occurs, more sophisticated control units have a facility known as -concurrent sense. This means that if an error occurs Extended sense information -will be presented in the Extended status word in the IRB. If not you have to -issue a subsequent SENSE CCW command after the test subchannel. - - -TPI (Test pending interrupt) can also be used for polled IO, but in -multitasking multiprocessor systems it isn't recommended except for -checking special cases (i.e. non looping checks for pending IO etc.). - -Store Subchannel and Modify Subchannel can be used to examine and modify -operating characteristics of a subchannel (e.g. channel paths). - -Other IO related Terms: -Sysplex: S390's Clustering Technology -QDIO: S390's new high speed IO architecture to support devices such as gigabit -ethernet, this architecture is also designed to be forward compatible with -upcoming 64 bit machines. - - -General Concepts - -Input Output Processors (IOP's) are responsible for communicating between -the mainframe CPU's & the channel & relieve the mainframe CPU's from the -burden of communicating with IO devices directly, this allows the CPU's to -concentrate on data processing. - -IOP's can use one or more links ( known as channel paths ) to talk to each -IO device. It first checks for path availability & chooses an available one, -then starts ( & sometimes terminates IO ). -There are two types of channel path: ESCON & the Parallel IO interface. - -IO devices are attached to control units, control units provide the -logic to interface the channel paths & channel path IO protocols to -the IO devices, they can be integrated with the devices or housed separately -& often talk to several similar devices ( typical examples would be raid -controllers or a control unit which connects to 1000 3270 terminals ). - - - +---------------------------------------------------------------+ - | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | - | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | | - | | | | | | | | | | Memory | | Storage | | - | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | - |---------------------------------------------------------------+ - | IOP | IOP | IOP | - |--------------------------------------------------------------- - | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | - ---------------------------------------------------------------- - || || - || Bus & Tag Channel Path || ESCON - || ====================== || Channel - || || || || Path - +----------+ +----------+ +----------+ - | | | | | | - | CU | | CU | | CU | - | | | | | | - +----------+ +----------+ +----------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -|I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device| -+----------+ +----------+ +----------+ +----------+ +----------+ - CPU = Central Processing Unit - C = Channel - IOP = IP Processor - CU = Control Unit - -The 390 IO systems come in 2 flavours the current 390 machines support both - -The Older 360 & 370 Interface,sometimes called the Parallel I/O interface, -sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers -Interface (OEMI). - -This byte wide Parallel channel path/bus has parity & data on the "Bus" cable -and control lines on the "Tag" cable. These can operate in byte multiplex mode -for sharing between several slow devices or burst mode and monopolize the -channel for the whole burst. Up to 256 devices can be addressed on one of these -cables. These cables are about one inch in diameter. The maximum unextended -length supported by these cables is 125 Meters but this can be extended up to -2km with a fibre optic channel extended such as a 3044. The maximum burst speed -supported is 4.5 megabytes per second. However, some really old processors -support only transfer rates of 3.0, 2.0 & 1.0 MB/sec. -One of these paths can be daisy chained to up to 8 control units. - - -ESCON if fibre optic it is also called FICON -Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or -lasers for communication at a signaling rate of up to 200 megabits/sec. As -10bits are transferred for every 8 bits info this drops to 160 megabits/sec -and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only -operates in burst mode. - -ESCONs typical max cable length is 3km for the led version and 20km for the -laser version known as XDF (extended distance facility). This can be further -extended by using an ESCON director which triples the above mentioned ranges. -Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture, -the standard Bus & Tag control protocol is however present within the packets. -Up to 256 devices can be attached to each control unit that uses one of these -interfaces. - -Common 390 Devices include: -Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters, -Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console). -DASD's direct access storage devices ( otherwise known as hard disks ). -Tape Drives. -CTC ( Channel to Channel Adapters ), -ESCON or Parallel Cables used as a very high speed serial link -between 2 machines. - - -Debugging IO on s/390 & z/Architecture under VM -=============================================== - -Now we are ready to go on with IO tracing commands under VM - -A few self explanatory queries: -Q OSA -Q CTC -Q DISK ( This command is CMS specific ) -Q DASD - - - - - - -Q OSA on my machine returns -OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000 -OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001 -OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002 -OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003 - -If you have a guest with certain privileges you may be able to see devices -which don't belong to you. To avoid this, add the option V. -e.g. -Q V OSA - -Now using the device numbers returned by this command we will -Trace the io starting up on the first device 7c08 & 7c09 -In our simplest case we can trace the -start subchannels -like TR SSCH 7C08-7C09 -or the halt subchannels -or TR HSCH 7C08-7C09 -MSCH's ,STSCH's I think you can guess the rest - -A good trick is tracing all the IO's and CCWS and spooling them into the reader -of another VM guest so he can ftp the logfile back to his own machine. I'll do -a small bit of this and give you a look at the output. - -1) Spool stdout to VM reader -SP PRT TO (another vm guest ) or * for the local vm guest -2) Fill the reader with the trace -TR IO 7c08-7c09 INST INT CCW PRT RUN -3) Start up linux -i 00c -4) Finish the trace -TR END -5) close the reader -C PRT -6) list reader contents -RDRLIST -7) copy it to linux4's minidisk -RECEIVE / LOG TXT A1 ( replace -8) -filel & press F11 to look at it -You should see something like: - -00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08 - CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80 - CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........ - IDAL 43D8AFE8 - IDAL 0FB76000 -00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4 -00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08 - CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC - KEY 0 FPI C0 CC 0 CTLS 4007 -00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08 - -If you don't like messing up your readed ( because you possibly booted from it ) -you can alternatively spool it to another readers guest. - - -Other common VM device related commands ---------------------------------------------- -These commands are listed only because they have -been of use to me in the past & may be of use to -you too. For more complete info on each of the commands -use type HELP from CMS. -detaching devices -DET -ATT -attach a device to guest * for your own guest -READY cause VM to issue a fake interrupt. - -The VARY command is normally only available to VM administrators. -VARY ON PATH TO -VARY OFF PATH FROM -This is used to switch on or off channel paths to devices. - -Q CHPID -This displays state of devices using this channel path -D SCHIB -This displays the subchannel information SCHIB block for the device. -this I believe is also only available to administrators. -DEFINE CTC -defines a virtual CTC channel to channel connection -2 need to be defined on each guest for the CTC driver to use. -COUPLE devno userid remote devno -Joins a local virtual device to a remote virtual device -( commonly used for the CTC driver ). - -Building a VM ramdisk under CMS which linux can use -def vfb- -blocksize is commonly 4096 for linux. -Formatting it -format (blksize - -Sharing a disk between multiple guests -LINK userid devno1 devno2 mode password - - - -GDB on S390 -=========== -N.B. if compiling for debugging gdb works better without optimisation -( see Compiling programs for debugging ) - -invocation ----------- -gdb - -Online help ------------ -help: gives help on commands -e.g. -help -help display -Note gdb's online help is very good use it. - - -Assembly --------- -info registers: displays registers other than floating point. -info all-registers: displays floating points as well. -disassemble: disassembles -e.g. -disassemble without parameters will disassemble the current function -disassemble $pc $pc+10 - -Viewing & modifying variables ------------------------------ -print or p: displays variable or register -e.g. p/x $sp will display the stack pointer - -display: prints variable or register each time program stops -e.g. -display/x $pc will display the program counter -display argc - -undisplay : undo's display's - -info breakpoints: shows all current breakpoints - -info stack: shows stack back trace (if this doesn't work too well, I'll show -you the stacktrace by hand below). - -info locals: displays local variables. - -info args: display current procedure arguments. - -set args: will set argc & argv each time the victim program is invoked. - -set =value -set argc=100 -set $pc=0 - - - -Modifying execution -------------------- -step: steps n lines of sourcecode -step steps 1 line. -step 100 steps 100 lines of code. - -next: like step except this will not step into subroutines - -stepi: steps a single machine code instruction. -e.g. stepi 100 - -nexti: steps a single machine code instruction but will not step into -subroutines. - -finish: will run until exit of the current routine - -run: (re)starts a program - -cont: continues a program - -quit: exits gdb. - - -breakpoints ------------- - -break -sets a breakpoint -e.g. - -break main - -break *$pc - -break *0x400618 - -Here's a really useful one for large programs -rbr -Set a breakpoint for all functions matching REGEXP -e.g. -rbr 390 -will set a breakpoint with all functions with 390 in their name. - -info breakpoints -lists all breakpoints - -delete: delete breakpoint by number or delete them all -e.g. -delete 1 will delete the first breakpoint -delete will delete them all - -watch: This will set a watchpoint ( usually hardware assisted ), -This will watch a variable till it changes -e.g. -watch cnt, will watch the variable cnt till it changes. -As an aside unfortunately gdb's, architecture independent watchpoint code -is inconsistent & not very good, watchpoints usually work but not always. - -info watchpoints: Display currently active watchpoints - -condition: ( another useful one ) -Specify breakpoint number N to break only if COND is true. -Usage is `condition N COND', where N is an integer and COND is an -expression to be evaluated whenever breakpoint N is reached. - - - -User defined functions/macros ------------------------------ -define: ( Note this is very very useful,simple & powerful ) -usage define end - -examples which you should consider putting into .gdbinit in your home directory -define d -stepi -disassemble $pc $pc+10 -end - -define e -nexti -disassemble $pc $pc+10 -end - - -Other hard to classify stuff ----------------------------- -signal n: -sends the victim program a signal. -e.g. signal 3 will send a SIGQUIT. - -info signals: -what gdb does when the victim receives certain signals. - -list: -e.g. -list lists current function source -list 1,10 list first 10 lines of current file. -list test.c:1,10 - - -directory: -Adds directories to be searched for source if gdb cannot find the source. -(note it is a bit sensitive about slashes) -e.g. To add the root of the filesystem to the searchpath do -directory // - - -call -This calls a function in the victim program, this is pretty powerful -e.g. -(gdb) call printf("hello world") -outputs: -$1 = 11 - -You might now be thinking that the line above didn't work, something extra had -to be done. -(gdb) call fflush(stdout) -hello world$2 = 0 -As an aside the debugger also calls malloc & free under the hood -to make space for the "hello world" string. - - - -hints ------ -1) command completion works just like bash -( if you are a bad typist like me this really helps ) -e.g. hit br & cursor up & down :-). - -2) if you have a debugging problem that takes a few steps to recreate -put the steps into a file called .gdbinit in your current working directory -if you have defined a few extra useful user defined commands put these in -your home directory & they will be read each time gdb is launched. - -A typical .gdbinit file might be. -break main -run -break runtime_exception -cont - - -stack chaining in gdb by hand ------------------------------ -This is done using a the same trick described for VM -p/x (*($sp+56))&0x7fffffff get the first backchain. - -For z/Architecture -Replace 56 with 112 & ignore the &0x7fffffff -in the macros below & do nasty casts to longs like the following -as gdb unfortunately deals with printed arguments as ints which -messes up everything. -i.e. here is a 3rd backchain dereference -p/x *(long *)(***(long ***)$sp+112) - - -this outputs -$5 = 0x528f18 -on my machine. -Now you can use -info symbol (*($sp+56))&0x7fffffff -you might see something like. -rl_getc + 36 in section .text telling you what is located at address 0x528f18 -Now do. -p/x (*(*$sp+56))&0x7fffffff -This outputs -$6 = 0x528ed0 -Now do. -info symbol (*(*$sp+56))&0x7fffffff -rl_read_key + 180 in section .text -now do -p/x (*(**$sp+56))&0x7fffffff -& so on. - -Disassembling instructions without debug info ---------------------------------------------- -gdb typically complains if there is a lack of debugging -symbols in the disassemble command with -"No function contains specified address." To get around -this do -x/xi
-e.g. -x/20xi 0x400730 - - - -Note: Remember gdb has history just like bash you don't need to retype the -whole line just use the up & down arrows. - - - -For more info -------------- -From your linuxbox do -man gdb or info gdb. - -core dumps ----------- -What a core dump ?, -A core dump is a file generated by the kernel (if allowed) which contains the -registers and all active pages of the program which has crashed. -From this file gdb will allow you to look at the registers, stack trace and -memory of the program as if it just crashed on your system. It is usually -called core and created in the current working directory. -This is very useful in that a customer can mail a core dump to a technical -support department and the technical support department can reconstruct what -happened. Provided they have an identical copy of this program with debugging -symbols compiled in and the source base of this build is available. -In short it is far more useful than something like a crash log could ever hope -to be. - -Why have I never seen one ?. -Probably because you haven't used the command -ulimit -c unlimited in bash -to allow core dumps, now do -ulimit -a -to verify that the limit was accepted. - -A sample core dump -To create this I'm going to do -ulimit -c unlimited -gdb -to launch gdb (my victim app. ) now be bad & do the following from another -telnet/xterm session to the same machine -ps -aux | grep gdb -kill -SIGSEGV -or alternatively use killall -SIGSEGV gdb if you have the killall command. -Now look at the core dump. -./gdb core -Displays the following -GNU gdb 4.18 -Copyright 1998 Free Software Foundation, Inc. -GDB is free software, covered by the GNU General Public License, and you are -welcome to change it and/or distribute copies of it under certain conditions. -Type "show copying" to see the conditions. -There is absolutely no warranty for GDB. Type "show warranty" for details. -This GDB was configured as "s390-ibm-linux"... -Core was generated by `./gdb'. -Program terminated with signal 11, Segmentation fault. -Reading symbols from /usr/lib/libncurses.so.4...done. -Reading symbols from /lib/libm.so.6...done. -Reading symbols from /lib/libc.so.6...done. -Reading symbols from /lib/ld-linux.so.2...done. -#0 0x40126d1a in read () from /lib/libc.so.6 -Setting up the environment for debugging gdb. -Breakpoint 1 at 0x4dc6f8: file utils.c, line 471. -Breakpoint 2 at 0x4d87a4: file top.c, line 2609. -(top-gdb) info stack -#0 0x40126d1a in read () from /lib/libc.so.6 -#1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402 -#2 0x528ed0 in rl_read_key () at input.c:381 -#3 0x5167e6 in readline_internal_char () at readline.c:454 -#4 0x5168ee in readline_internal_charloop () at readline.c:507 -#5 0x51692c in readline_internal () at readline.c:521 -#6 0x5164fe in readline (prompt=0x7ffff810) - at readline.c:349 -#7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1, - annotation_suffix=0x4d6b44 "prompt") at top.c:2091 -#8 0x4d6cf0 in command_loop () at top.c:1345 -#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 - - -LDD -=== -This is a program which lists the shared libraries which a library needs, -Note you also get the relocations of the shared library text segments which -help when using objdump --source. -e.g. - ldd ./gdb -outputs -libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000) -libm.so.6 => /lib/libm.so.6 (0x4005e000) -libc.so.6 => /lib/libc.so.6 (0x40084000) -/lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000) - - -Debugging shared libraries -========================== -Most programs use shared libraries, however it can be very painful -when you single step instruction into a function like printf for the -first time & you end up in functions like _dl_runtime_resolve this is -the ld.so doing lazy binding, lazy binding is a concept in ELF where -shared library functions are not loaded into memory unless they are -actually used, great for saving memory but a pain to debug. -To get around this either relink the program -static or exit gdb type -export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing -the program in question. - - - -Debugging modules -================= -As modules are dynamically loaded into the kernel their address can be -anywhere to get around this use the -m option with insmod to emit a load -map which can be piped into a file if required. - -The proc file system -==================== -What is it ?. -It is a filesystem created by the kernel with files which are created on demand -by the kernel if read, or can be used to modify kernel parameters, -it is a powerful concept. - -e.g. - -cat /proc/sys/net/ipv4/ip_forward -On my machine outputs -0 -telling me ip_forwarding is not on to switch it on I can do -echo 1 > /proc/sys/net/ipv4/ip_forward -cat it again -cat /proc/sys/net/ipv4/ip_forward -On my machine now outputs -1 -IP forwarding is on. -There is a lot of useful info in here best found by going in and having a look -around, so I'll take you through some entries I consider important. - -All the processes running on the machine have their own entry defined by -/proc/ -So lets have a look at the init process -cd /proc/1 - -cat cmdline -emits -init [2] - -cd /proc/1/fd -This contains numerical entries of all the open files, -some of these you can cat e.g. stdout (2) - -cat /proc/29/maps -on my machine emits - -00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash -00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash -0047e000-00492000 rwxp 00000000 00:00 0 -40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so -40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so -40016000-40017000 rwxp 00000000 00:00 0 -40017000-40018000 rw-p 00000000 00:00 0 -40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8 -4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8 -4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so -4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so -40111000-40114000 rw-p 00000000 00:00 0 -40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so -4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so -7fffd000-80000000 rwxp ffffe000 00:00 0 - - -Showing us the shared libraries init uses where they are in memory -& memory access permissions for each virtual memory area. - -/proc/1/cwd is a softlink to the current working directory. -/proc/1/root is the root of the filesystem for this process. - -/proc/1/mem is the current running processes memory which you -can read & write to like a file. -strace uses this sometimes as it is a bit faster than the -rather inefficient ptrace interface for peeking at DATA. - - -cat status - -Name: init -State: S (sleeping) -Pid: 1 -PPid: 0 -Uid: 0 0 0 0 -Gid: 0 0 0 0 -Groups: -VmSize: 408 kB -VmLck: 0 kB -VmRSS: 208 kB -VmData: 24 kB -VmStk: 8 kB -VmExe: 368 kB -VmLib: 0 kB -SigPnd: 0000000000000000 -SigBlk: 0000000000000000 -SigIgn: 7fffffffd7f0d8fc -SigCgt: 00000000280b2603 -CapInh: 00000000fffffeff -CapPrm: 00000000ffffffff -CapEff: 00000000fffffeff - -User PSW: 070de000 80414146 -task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68 -User GPRS: -00000400 00000000 0000000b 7ffffa90 -00000000 00000000 00000000 0045d9f4 -0045cafc 7ffffa90 7fffff18 0045cb08 -00010400 804039e8 80403af8 7ffff8b0 -User ACRS: -00000000 00000000 00000000 00000000 -00000001 00000000 00000000 00000000 -00000000 00000000 00000000 00000000 -00000000 00000000 00000000 00000000 -Kernel BackChain CallChain BackChain CallChain - 004b7ca8 8002bd0c 004b7d18 8002b92c - 004b7db8 8005cd50 004b7e38 8005d12a - 004b7f08 80019114 -Showing among other things memory usage & status of some signals & -the processes'es registers from the kernel task_structure -as well as a backchain which may be useful if a process crashes -in the kernel for some unknown reason. - -Some driver debugging techniques -================================ -debug feature -------------- -Some of our drivers now support a "debug feature" in -/proc/s390dbf see s390dbf.txt in the linux/Documentation directory -for more info. -e.g. -to switch on the lcs "debug feature" -echo 5 > /proc/s390dbf/lcs/level -& then after the error occurred. -cat /proc/s390dbf/lcs/sprintf >/logfile -the logfile now contains some information which may help -tech support resolve a problem in the field. - - - -high level debugging network drivers ------------------------------------- -ifconfig is a quite useful command -it gives the current state of network drivers. - -If you suspect your network device driver is dead -one way to check is type -ifconfig -e.g. tr0 -You should see something like -tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48 - inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0 - UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1 - RX packets:246134 errors:0 dropped:0 overruns:0 frame:0 - TX packets:5 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:100 - -if the device doesn't say up -try -/etc/rc.d/init.d/network start -( this starts the network stack & hopefully calls ifconfig tr0 up ). -ifconfig looks at the output of /proc/net/dev and presents it in a more -presentable form. -Now ping the device from a machine in the same subnet. -if the RX packets count & TX packets counts don't increment you probably -have problems. -next -cat /proc/net/arp -Do you see any hardware addresses in the cache if not you may have problems. -Next try -ping -c 5 i.e. the Bcast field above in the output of -ifconfig. Do you see any replies from machines other than the local machine -if not you may have problems. also if the TX packets count in ifconfig -hasn't incremented either you have serious problems in your driver -(e.g. the txbusy field of the network device being stuck on ) -or you may have multiple network devices connected. - - -chandev -------- -There is a new device layer for channel devices, some -drivers e.g. lcs are registered with this layer. -If the device uses the channel device layer you'll be -able to find what interrupts it uses & the current state -of the device. -See the manpage chandev.8 &type cat /proc/chandev for more info. - - -SysRq -===== -This is now supported by linux for s/390 & z/Architecture. -To enable it do compile the kernel with -Kernel Hacking -> Magic SysRq Key Enabled -echo "1" > /proc/sys/kernel/sysrq -also type -echo "8" >/proc/sys/kernel/printk -To make printk output go to console. -On 390 all commands are prefixed with -^- -e.g. -^-t will show tasks. -^-? or some unknown command will display help. -The sysrq key reading is very picky ( I have to type the keys in an - xterm session & paste them into the x3270 console ) -& it may be wise to predefine the keys as described in the VM hints above - -This is particularly useful for syncing disks unmounting & rebooting -if the machine gets partially hung. - -Read Documentation/admin-guide/sysrq.rst for more info - -References: -=========== -Enterprise Systems Architecture Reference Summary -Enterprise Systems Architecture Principles of Operation -Hartmut Penners s390 stack frame sheet. -IBM Mainframe Channel Attachment a technology brief from a CISCO webpage -Various bits of man & info pages of Linux. -Linux & GDB source. -Various info & man pages. -CMS Help on tracing commands. -Linux for s/390 Elf Application Binary Interface -Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended ) -z/Architecture Principles of Operation SA22-7832-00 -Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the -Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05 - -Special Thanks -============== -Special thanks to Neale Ferguson who maintains a much -prettier HTML version of this page at -http://linuxvm.org/penguinvm/ -Bob Grainger Stefan Bader & others for reporting bugs diff --git a/Documentation/s390/cds.rst b/Documentation/s390/cds.rst new file mode 100644 index 000000000000..7006d8209d2e --- /dev/null +++ b/Documentation/s390/cds.rst @@ -0,0 +1,530 @@ +=========================== +Linux for S/390 and zSeries +=========================== + +Common Device Support (CDS) +Device Driver I/O Support Routines + +Authors: + - Ingo Adlung + - Cornelia Huck + +Copyright, IBM Corp. 1999-2002 + +Introduction +============ + +This document describes the common device support routines for Linux/390. +Different than other hardware architectures, ESA/390 has defined a unified +I/O access method. This gives relief to the device drivers as they don't +have to deal with different bus types, polling versus interrupt +processing, shared versus non-shared interrupt processing, DMA versus port +I/O (PIO), and other hardware features more. However, this implies that +either every single device driver needs to implement the hardware I/O +attachment functionality itself, or the operating system provides for a +unified method to access the hardware, providing all the functionality that +every single device driver would have to provide itself. + +The document does not intend to explain the ESA/390 hardware architecture in +every detail.This information can be obtained from the ESA/390 Principles of +Operation manual (IBM Form. No. SA22-7201). + +In order to build common device support for ESA/390 I/O interfaces, a +functional layer was introduced that provides generic I/O access methods to +the hardware. + +The common device support layer comprises the I/O support routines defined +below. Some of them implement common Linux device driver interfaces, while +some of them are ESA/390 platform specific. + +Note: + In order to write a driver for S/390, you also need to look into the interface + described in Documentation/s390/driver-model.rst. + +Note for porting drivers from 2.4: + +The major changes are: + +* The functions use a ccw_device instead of an irq (subchannel). +* All drivers must define a ccw_driver (see driver-model.txt) and the associated + functions. +* request_irq() and free_irq() are no longer done by the driver. +* The oper_handler is (kindof) replaced by the probe() and set_online() functions + of the ccw_driver. +* The not_oper_handler is (kindof) replaced by the remove() and set_offline() + functions of the ccw_driver. +* The channel device layer is gone. +* The interrupt handlers must be adapted to use a ccw_device as argument. + Moreover, they don't return a devstat, but an irb. +* Before initiating an io, the options must be set via ccw_device_set_options(). +* Instead of calling read_dev_chars()/read_conf_data(), the driver issues + the channel program and handles the interrupt itself. + +ccw_device_get_ciw() + get commands from extended sense data. + +ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout() + initiate an I/O request. + +ccw_device_resume() + resume channel program execution. + +ccw_device_halt() + terminate the current I/O request processed on the device. + +do_IRQ() + generic interrupt routine. This function is called by the interrupt entry + routine whenever an I/O interrupt is presented to the system. The do_IRQ() + routine determines the interrupt status and calls the device specific + interrupt handler according to the rules (flags) defined during I/O request + initiation with do_IO(). + +The next chapters describe the functions other than do_IRQ() in more details. +The do_IRQ() interface is not described, as it is called from the Linux/390 +first level interrupt handler only and does not comprise a device driver +callable interface. Instead, the functional description of do_IO() also +describes the input to the device specific interrupt handler. + +Note: + All explanations apply also to the 64 bit architecture s390x. + + +Common Device Support (CDS) for Linux/390 Device Drivers +======================================================== + +General Information +------------------- + +The following chapters describe the I/O related interface routines the +Linux/390 common device support (CDS) provides to allow for device specific +driver implementations on the IBM ESA/390 hardware platform. Those interfaces +intend to provide the functionality required by every device driver +implementation to allow to drive a specific hardware device on the ESA/390 +platform. Some of the interface routines are specific to Linux/390 and some +of them can be found on other Linux platforms implementations too. +Miscellaneous function prototypes, data declarations, and macro definitions +can be found in the architecture specific C header file +linux/arch/s390/include/asm/irq.h. + +Overview of CDS interface concepts +---------------------------------- + +Different to other hardware platforms, the ESA/390 architecture doesn't define +interrupt lines managed by a specific interrupt controller and bus systems +that may or may not allow for shared interrupts, DMA processing, etc.. Instead, +the ESA/390 architecture has implemented a so called channel subsystem, that +provides a unified view of the devices physically attached to the systems. +Though the ESA/390 hardware platform knows about a huge variety of different +peripheral attachments like disk devices (aka. DASDs), tapes, communication +controllers, etc. they can all be accessed by a well defined access method and +they are presenting I/O completion a unified way : I/O interruptions. Every +single device is uniquely identified to the system by a so called subchannel, +where the ESA/390 architecture allows for 64k devices be attached. + +Linux, however, was first built on the Intel PC architecture, with its two +cascaded 8259 programmable interrupt controllers (PICs), that allow for a +maximum of 15 different interrupt lines. All devices attached to such a system +share those 15 interrupt levels. Devices attached to the ISA bus system must +not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered +interrupts. MCA, EISA, PCI and other bus systems base on level triggered +interrupts, and therewith allow for shared IRQs. However, if multiple devices +present their hardware status by the same (shared) IRQ, the operating system +has to call every single device driver registered on this IRQ in order to +determine the device driver owning the device that raised the interrupt. + +Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel). +For internal use of the common I/O layer, these are still there. However, +device drivers should use the new calling interface via the ccw_device only. + +During its startup the Linux/390 system checks for peripheral devices. Each +of those devices is uniquely defined by a so called subchannel by the ESA/390 +channel subsystem. While the subchannel numbers are system generated, each +subchannel also takes a user defined attribute, the so called device number. +Both subchannel number and device number cannot exceed 65535. During sysfs +initialisation, the information about control unit type and device types that +imply specific I/O commands (channel command words - CCWs) in order to operate +the device are gathered. Device drivers can retrieve this set of hardware +information during their initialization step to recognize the devices they +support using the information saved in the struct ccw_device given to them. +This methods implies that Linux/390 doesn't require to probe for free (not +armed) interrupt request lines (IRQs) to drive its devices with. Where +applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS +ccw to retrieve device characteristics in its online routine. + +In order to allow for easy I/O initiation the CDS layer provides a +ccw_device_start() interface that takes a device specific channel program (one +or more CCWs) as input sets up the required architecture specific control blocks +and initiates an I/O request on behalf of the device driver. The +ccw_device_start() routine allows to specify whether it expects the CDS layer +to notify the device driver for every interrupt it observes, or with final status +only. See ccw_device_start() for more details. A device driver must never issue +ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead. + +For long running I/O request to be canceled, the CDS layer provides the +ccw_device_halt() function. Some devices require to initially issue a HALT +SUBCHANNEL (HSCH) command without having pending I/O requests. This function is +also covered by ccw_device_halt(). + + +get_ciw() - get command information word + +This call enables a device driver to get information about supported commands +from the extended SenseID data. + +:: + + struct ciw * + ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); + +==== ======================================================== +cdev The ccw_device for which the command is to be retrieved. +cmd The command type to be retrieved. +==== ======================================================== + +ccw_device_get_ciw() returns: + +===== ================================================================ + NULL No extended data available, invalid device or command not found. +!NULL The command requested. +===== ================================================================ + +:: + + ccw_device_start() - Initiate I/O Request + +The ccw_device_start() routines is the I/O request front-end processor. All +device driver I/O requests must be issued using this routine. A device driver +must not issue ESA/390 I/O commands itself. Instead the ccw_device_start() +routine provides all interfaces required to drive arbitrary devices. + +This description also covers the status information passed to the device +driver's interrupt handler as this is related to the rules (flags) defined +with the associated I/O request when calling ccw_device_start(). + +:: + + int ccw_device_start(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags); + int ccw_device_start_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags, + int expires); + int ccw_device_start_key(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags); + int ccw_device_start_key_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags, + int expires); + +============= ============================================================= +cdev ccw_device the I/O is destined for +cpa logical start address of channel program +user_intparm user specific interrupt information; will be presented + back to the device driver's interrupt handler. Allows a + device driver to associate the interrupt with a + particular I/O request. +lpm defines the channel path to be used for a specific I/O + request. A value of 0 will make cio use the opm. +key the storage key to use for the I/O (useful for operating on a + storage with a storage key != default key) +flag defines the action to be performed for I/O processing +expires timeout value in jiffies. The common I/O layer will terminate + the running program after this and call the interrupt handler + with ERR_PTR(-ETIMEDOUT) as irb. +============= ============================================================= + +Possible flag values are: + +========================= ============================================= +DOIO_ALLOW_SUSPEND channel program may become suspended +DOIO_DENY_PREFETCH don't allow for CCW prefetch; usually + this implies the channel program might + become modified +DOIO_SUPPRESS_INTER don't call the handler on intermediate status +========================= ============================================= + +The cpa parameter points to the first format 1 CCW of a channel program:: + + struct ccw1 { + __u8 cmd_code;/* command code */ + __u8 flags; /* flags, like IDA addressing, etc. */ + __u16 count; /* byte count */ + __u32 cda; /* data address */ + } __attribute__ ((packed,aligned(8))); + +with the following CCW flags values defined: + +=================== ========================= +CCW_FLAG_DC data chaining +CCW_FLAG_CC command chaining +CCW_FLAG_SLI suppress incorrect length +CCW_FLAG_SKIP skip +CCW_FLAG_PCI PCI +CCW_FLAG_IDA indirect addressing +CCW_FLAG_SUSPEND suspend +=================== ========================= + + +Via ccw_device_set_options(), the device driver may specify the following +options for the device: + +========================= ====================================== +DOIO_EARLY_NOTIFICATION allow for early interrupt notification +DOIO_REPORT_ALL report all interrupt conditions +========================= ====================================== + + +The ccw_device_start() function returns: + +======== ====================================================================== + 0 successful completion or request successfully initiated + -EBUSY The device is currently processing a previous I/O request, or there is + a status pending at the device. +-ENODEV cdev is invalid, the device is not operational or the ccw_device is + not online. +======== ====================================================================== + +When the I/O request completes, the CDS first level interrupt handler will +accumulate the status in a struct irb and then call the device interrupt handler. +The intparm field will contain the value the device driver has associated with a +particular I/O request. If a pending device status was recognized, +intparm will be set to 0 (zero). This may happen during I/O initiation or delayed +by an alert status notification. In any case this status is not related to the +current (last) I/O request. In case of a delayed status notification no special +interrupt will be presented to indicate I/O completion as the I/O request was +never started, even though ccw_device_start() returned with successful completion. + +The irb may contain an error value, and the device driver should check for this +first: + +========== ================================================================= +-ETIMEDOUT the common I/O layer terminated the request after the specified + timeout value +-EIO the common I/O layer terminated the request due to an error state +========== ================================================================= + +If the concurrent sense flag in the extended status word (esw) in the irb is +set, the field erw.scnt in the esw describes the number of device specific +sense bytes available in the extended control word irb->scsw.ecw[]. No device +sensing by the device driver itself is required. + +The device interrupt handler can use the following definitions to investigate +the primary unit check source coded in sense byte 0 : + +======================= ==== +SNS0_CMD_REJECT 0x80 +SNS0_INTERVENTION_REQ 0x40 +SNS0_BUS_OUT_CHECK 0x20 +SNS0_EQUIPMENT_CHECK 0x10 +SNS0_DATA_CHECK 0x08 +SNS0_OVERRUN 0x04 +SNS0_INCOMPL_DOMAIN 0x01 +======================= ==== + +Depending on the device status, multiple of those values may be set together. +Please refer to the device specific documentation for details. + +The irb->scsw.cstat field provides the (accumulated) subchannel status : + +========================= ============================ +SCHN_STAT_PCI program controlled interrupt +SCHN_STAT_INCORR_LEN incorrect length +SCHN_STAT_PROG_CHECK program check +SCHN_STAT_PROT_CHECK protection check +SCHN_STAT_CHN_DATA_CHK channel data check +SCHN_STAT_CHN_CTRL_CHK channel control check +SCHN_STAT_INTF_CTRL_CHK interface control check +SCHN_STAT_CHAIN_CHECK chaining check +========================= ============================ + +The irb->scsw.dstat field provides the (accumulated) device status : + +===================== ================= +DEV_STAT_ATTENTION attention +DEV_STAT_STAT_MOD status modifier +DEV_STAT_CU_END control unit end +DEV_STAT_BUSY busy +DEV_STAT_CHN_END channel end +DEV_STAT_DEV_END device end +DEV_STAT_UNIT_CHECK unit check +DEV_STAT_UNIT_EXCEP unit exception +===================== ================= + +Please see the ESA/390 Principles of Operation manual for details on the +individual flag meanings. + +Usage Notes: + +ccw_device_start() must be called disabled and with the ccw device lock held. + +The device driver is allowed to issue the next ccw_device_start() call from +within its interrupt handler already. It is not required to schedule a +bottom-half, unless a non deterministically long running error recovery procedure +or similar needs to be scheduled. During I/O processing the Linux/390 generic +I/O device driver support has already obtained the IRQ lock, i.e. the handler +must not try to obtain it again when calling ccw_device_start() or we end in a +deadlock situation! + +If a device driver relies on an I/O request to be completed prior to start the +next it can reduce I/O processing overhead by chaining a NoOp I/O command +CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End +and Device-End status to be presented together, with a single interrupt. +However, this should be used with care as it implies the channel will remain +busy, not being able to process I/O requests for other devices on the same +channel. Therefore e.g. read commands should never use this technique, as the +result will be presented by a single interrupt anyway. + +In order to minimize I/O overhead, a device driver should use the +DOIO_REPORT_ALL only if the device can report intermediate interrupt +information prior to device-end the device driver urgently relies on. In this +case all I/O interruptions are presented to the device driver until final +status is recognized. + +If a device is able to recover from asynchronously presented I/O errors, it can +perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some +devices always report channel-end and device-end together, with a single +interrupt, others present primary status (channel-end) when the channel is +ready for the next I/O request and secondary status (device-end) when the data +transmission has been completed at the device. + +Above flag allows to exploit this feature, e.g. for communication devices that +can handle lost data on the network to allow for enhanced I/O processing. + +Unless the channel subsystem at any time presents a secondary status interrupt, +exploiting this feature will cause only primary status interrupts to be +presented to the device driver while overlapping I/O is performed. When a +secondary status without error (alert status) is presented, this indicates +successful completion for all overlapping ccw_device_start() requests that have +been issued since the last secondary (final) status. + +Channel programs that intend to set the suspend flag on a channel command word +(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the +suspend flag will cause a channel program check. At the time the channel program +becomes suspended an intermediate interrupt will be generated by the channel +subsystem. + +ccw_device_resume() - Resume Channel Program Execution + +If a device driver chooses to suspend the current channel program execution by +setting the CCW suspend flag on a particular CCW, the channel program execution +is suspended. In order to resume channel program execution the CIO layer +provides the ccw_device_resume() routine. + +:: + + int ccw_device_resume(struct ccw_device *cdev); + +==== ================================================ +cdev ccw_device the resume operation is requested for +==== ================================================ + +The ccw_device_resume() function returns: + +========= ============================================== + 0 suspended channel program is resumed + -EBUSY status pending + -ENODEV cdev invalid or not-operational subchannel + -EINVAL resume function not applicable +-ENOTCONN there is no I/O request pending for completion +========= ============================================== + +Usage Notes: + +Please have a look at the ccw_device_start() usage notes for more details on +suspended channel programs. + +ccw_device_halt() - Halt I/O Request Processing + +Sometimes a device driver might need a possibility to stop the processing of +a long-running channel program or the device might require to initially issue +a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt() +command is provided. + +ccw_device_halt() must be called disabled and with the ccw device lock held. + +:: + + int ccw_device_halt(struct ccw_device *cdev, + unsigned long intparm); + +======= ===================================================== +cdev ccw_device the halt operation is requested for +intparm interruption parameter; value is only used if no I/O + is outstanding, otherwise the intparm associated with + the I/O request is returned +======= ===================================================== + +The ccw_device_halt() function returns: + +======= ============================================================== + 0 request successfully initiated +-EBUSY the device is currently busy, or status pending. +-ENODEV cdev invalid. +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== + +Usage Notes: + +A device driver may write a never-ending channel program by writing a channel +program that at its end loops back to its beginning by means of a transfer in +channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network +device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is +executed a program controlled interrupt (PCI) is generated. The device driver +can then perform an appropriate action. Prior to interrupt of an outstanding +read to a network device (with or without PCI flag) a ccw_device_halt() +is required to end the pending operation. + +:: + + ccw_device_clear() - Terminage I/O Request Processing + +In order to terminate all I/O processing at the subchannel, the clear subchannel +(CSCH) command is used. It can be issued via ccw_device_clear(). + +ccw_device_clear() must be called disabled and with the ccw device lock held. + +:: + + int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); + +======= =============================================== +cdev ccw_device the clear operation is requested for +intparm interruption parameter (see ccw_device_halt()) +======= =============================================== + +The ccw_device_clear() function returns: + +======= ============================================================== + 0 request successfully initiated +-ENODEV cdev invalid +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== + +Miscellaneous Support Routines +------------------------------ + +This chapter describes various routines to be used in a Linux/390 device +driver programming environment. + +get_ccwdev_lock() + +Get the address of the device specific lock. This is then used in +spin_lock() / spin_unlock() calls. + +:: + + __u8 ccw_device_get_path_mask(struct ccw_device *cdev); + +Get the mask of the path currently available for cdev. diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt deleted file mode 100644 index 480a78ef5a1e..000000000000 --- a/Documentation/s390/cds.txt +++ /dev/null @@ -1,472 +0,0 @@ -Linux for S/390 and zSeries - -Common Device Support (CDS) -Device Driver I/O Support Routines - -Authors : Ingo Adlung - Cornelia Huck - -Copyright, IBM Corp. 1999-2002 - -Introduction - -This document describes the common device support routines for Linux/390. -Different than other hardware architectures, ESA/390 has defined a unified -I/O access method. This gives relief to the device drivers as they don't -have to deal with different bus types, polling versus interrupt -processing, shared versus non-shared interrupt processing, DMA versus port -I/O (PIO), and other hardware features more. However, this implies that -either every single device driver needs to implement the hardware I/O -attachment functionality itself, or the operating system provides for a -unified method to access the hardware, providing all the functionality that -every single device driver would have to provide itself. - -The document does not intend to explain the ESA/390 hardware architecture in -every detail.This information can be obtained from the ESA/390 Principles of -Operation manual (IBM Form. No. SA22-7201). - -In order to build common device support for ESA/390 I/O interfaces, a -functional layer was introduced that provides generic I/O access methods to -the hardware. - -The common device support layer comprises the I/O support routines defined -below. Some of them implement common Linux device driver interfaces, while -some of them are ESA/390 platform specific. - -Note: -In order to write a driver for S/390, you also need to look into the interface -described in Documentation/s390/driver-model.txt. - -Note for porting drivers from 2.4: -The major changes are: -* The functions use a ccw_device instead of an irq (subchannel). -* All drivers must define a ccw_driver (see driver-model.txt) and the associated - functions. -* request_irq() and free_irq() are no longer done by the driver. -* The oper_handler is (kindof) replaced by the probe() and set_online() functions - of the ccw_driver. -* The not_oper_handler is (kindof) replaced by the remove() and set_offline() - functions of the ccw_driver. -* The channel device layer is gone. -* The interrupt handlers must be adapted to use a ccw_device as argument. - Moreover, they don't return a devstat, but an irb. -* Before initiating an io, the options must be set via ccw_device_set_options(). -* Instead of calling read_dev_chars()/read_conf_data(), the driver issues - the channel program and handles the interrupt itself. - -ccw_device_get_ciw() - get commands from extended sense data. - -ccw_device_start() -ccw_device_start_timeout() -ccw_device_start_key() -ccw_device_start_key_timeout() - initiate an I/O request. - -ccw_device_resume() - resume channel program execution. - -ccw_device_halt() - terminate the current I/O request processed on the device. - -do_IRQ() - generic interrupt routine. This function is called by the interrupt entry - routine whenever an I/O interrupt is presented to the system. The do_IRQ() - routine determines the interrupt status and calls the device specific - interrupt handler according to the rules (flags) defined during I/O request - initiation with do_IO(). - -The next chapters describe the functions other than do_IRQ() in more details. -The do_IRQ() interface is not described, as it is called from the Linux/390 -first level interrupt handler only and does not comprise a device driver -callable interface. Instead, the functional description of do_IO() also -describes the input to the device specific interrupt handler. - -Note: All explanations apply also to the 64 bit architecture s390x. - - -Common Device Support (CDS) for Linux/390 Device Drivers - -General Information - -The following chapters describe the I/O related interface routines the -Linux/390 common device support (CDS) provides to allow for device specific -driver implementations on the IBM ESA/390 hardware platform. Those interfaces -intend to provide the functionality required by every device driver -implementation to allow to drive a specific hardware device on the ESA/390 -platform. Some of the interface routines are specific to Linux/390 and some -of them can be found on other Linux platforms implementations too. -Miscellaneous function prototypes, data declarations, and macro definitions -can be found in the architecture specific C header file -linux/arch/s390/include/asm/irq.h. - -Overview of CDS interface concepts - -Different to other hardware platforms, the ESA/390 architecture doesn't define -interrupt lines managed by a specific interrupt controller and bus systems -that may or may not allow for shared interrupts, DMA processing, etc.. Instead, -the ESA/390 architecture has implemented a so called channel subsystem, that -provides a unified view of the devices physically attached to the systems. -Though the ESA/390 hardware platform knows about a huge variety of different -peripheral attachments like disk devices (aka. DASDs), tapes, communication -controllers, etc. they can all be accessed by a well defined access method and -they are presenting I/O completion a unified way : I/O interruptions. Every -single device is uniquely identified to the system by a so called subchannel, -where the ESA/390 architecture allows for 64k devices be attached. - -Linux, however, was first built on the Intel PC architecture, with its two -cascaded 8259 programmable interrupt controllers (PICs), that allow for a -maximum of 15 different interrupt lines. All devices attached to such a system -share those 15 interrupt levels. Devices attached to the ISA bus system must -not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered -interrupts. MCA, EISA, PCI and other bus systems base on level triggered -interrupts, and therewith allow for shared IRQs. However, if multiple devices -present their hardware status by the same (shared) IRQ, the operating system -has to call every single device driver registered on this IRQ in order to -determine the device driver owning the device that raised the interrupt. - -Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel). -For internal use of the common I/O layer, these are still there. However, -device drivers should use the new calling interface via the ccw_device only. - -During its startup the Linux/390 system checks for peripheral devices. Each -of those devices is uniquely defined by a so called subchannel by the ESA/390 -channel subsystem. While the subchannel numbers are system generated, each -subchannel also takes a user defined attribute, the so called device number. -Both subchannel number and device number cannot exceed 65535. During sysfs -initialisation, the information about control unit type and device types that -imply specific I/O commands (channel command words - CCWs) in order to operate -the device are gathered. Device drivers can retrieve this set of hardware -information during their initialization step to recognize the devices they -support using the information saved in the struct ccw_device given to them. -This methods implies that Linux/390 doesn't require to probe for free (not -armed) interrupt request lines (IRQs) to drive its devices with. Where -applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS -ccw to retrieve device characteristics in its online routine. - -In order to allow for easy I/O initiation the CDS layer provides a -ccw_device_start() interface that takes a device specific channel program (one -or more CCWs) as input sets up the required architecture specific control blocks -and initiates an I/O request on behalf of the device driver. The -ccw_device_start() routine allows to specify whether it expects the CDS layer -to notify the device driver for every interrupt it observes, or with final status -only. See ccw_device_start() for more details. A device driver must never issue -ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead. - -For long running I/O request to be canceled, the CDS layer provides the -ccw_device_halt() function. Some devices require to initially issue a HALT -SUBCHANNEL (HSCH) command without having pending I/O requests. This function is -also covered by ccw_device_halt(). - - -get_ciw() - get command information word - -This call enables a device driver to get information about supported commands -from the extended SenseID data. - -struct ciw * -ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); - -cdev - The ccw_device for which the command is to be retrieved. -cmd - The command type to be retrieved. - -ccw_device_get_ciw() returns: -NULL - No extended data available, invalid device or command not found. -!NULL - The command requested. - - -ccw_device_start() - Initiate I/O Request - -The ccw_device_start() routines is the I/O request front-end processor. All -device driver I/O requests must be issued using this routine. A device driver -must not issue ESA/390 I/O commands itself. Instead the ccw_device_start() -routine provides all interfaces required to drive arbitrary devices. - -This description also covers the status information passed to the device -driver's interrupt handler as this is related to the rules (flags) defined -with the associated I/O request when calling ccw_device_start(). - -int ccw_device_start(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags); -int ccw_device_start_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags, - int expires); -int ccw_device_start_key(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags); -int ccw_device_start_key_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags, - int expires); - -cdev : ccw_device the I/O is destined for -cpa : logical start address of channel program -user_intparm : user specific interrupt information; will be presented - back to the device driver's interrupt handler. Allows a - device driver to associate the interrupt with a - particular I/O request. -lpm : defines the channel path to be used for a specific I/O - request. A value of 0 will make cio use the opm. -key : the storage key to use for the I/O (useful for operating on a - storage with a storage key != default key) -flag : defines the action to be performed for I/O processing -expires : timeout value in jiffies. The common I/O layer will terminate - the running program after this and call the interrupt handler - with ERR_PTR(-ETIMEDOUT) as irb. - -Possible flag values are : - -DOIO_ALLOW_SUSPEND - channel program may become suspended -DOIO_DENY_PREFETCH - don't allow for CCW prefetch; usually - this implies the channel program might - become modified -DOIO_SUPPRESS_INTER - don't call the handler on intermediate status - -The cpa parameter points to the first format 1 CCW of a channel program : - -struct ccw1 { - __u8 cmd_code;/* command code */ - __u8 flags; /* flags, like IDA addressing, etc. */ - __u16 count; /* byte count */ - __u32 cda; /* data address */ -} __attribute__ ((packed,aligned(8))); - -with the following CCW flags values defined : - -CCW_FLAG_DC - data chaining -CCW_FLAG_CC - command chaining -CCW_FLAG_SLI - suppress incorrect length -CCW_FLAG_SKIP - skip -CCW_FLAG_PCI - PCI -CCW_FLAG_IDA - indirect addressing -CCW_FLAG_SUSPEND - suspend - - -Via ccw_device_set_options(), the device driver may specify the following -options for the device: - -DOIO_EARLY_NOTIFICATION - allow for early interrupt notification -DOIO_REPORT_ALL - report all interrupt conditions - - -The ccw_device_start() function returns : - - 0 - successful completion or request successfully initiated --EBUSY - The device is currently processing a previous I/O request, or there is - a status pending at the device. --ENODEV - cdev is invalid, the device is not operational or the ccw_device is - not online. - -When the I/O request completes, the CDS first level interrupt handler will -accumulate the status in a struct irb and then call the device interrupt handler. -The intparm field will contain the value the device driver has associated with a -particular I/O request. If a pending device status was recognized, -intparm will be set to 0 (zero). This may happen during I/O initiation or delayed -by an alert status notification. In any case this status is not related to the -current (last) I/O request. In case of a delayed status notification no special -interrupt will be presented to indicate I/O completion as the I/O request was -never started, even though ccw_device_start() returned with successful completion. - -The irb may contain an error value, and the device driver should check for this -first: - --ETIMEDOUT: the common I/O layer terminated the request after the specified - timeout value --EIO: the common I/O layer terminated the request due to an error state - -If the concurrent sense flag in the extended status word (esw) in the irb is -set, the field erw.scnt in the esw describes the number of device specific -sense bytes available in the extended control word irb->scsw.ecw[]. No device -sensing by the device driver itself is required. - -The device interrupt handler can use the following definitions to investigate -the primary unit check source coded in sense byte 0 : - -SNS0_CMD_REJECT 0x80 -SNS0_INTERVENTION_REQ 0x40 -SNS0_BUS_OUT_CHECK 0x20 -SNS0_EQUIPMENT_CHECK 0x10 -SNS0_DATA_CHECK 0x08 -SNS0_OVERRUN 0x04 -SNS0_INCOMPL_DOMAIN 0x01 - -Depending on the device status, multiple of those values may be set together. -Please refer to the device specific documentation for details. - -The irb->scsw.cstat field provides the (accumulated) subchannel status : - -SCHN_STAT_PCI - program controlled interrupt -SCHN_STAT_INCORR_LEN - incorrect length -SCHN_STAT_PROG_CHECK - program check -SCHN_STAT_PROT_CHECK - protection check -SCHN_STAT_CHN_DATA_CHK - channel data check -SCHN_STAT_CHN_CTRL_CHK - channel control check -SCHN_STAT_INTF_CTRL_CHK - interface control check -SCHN_STAT_CHAIN_CHECK - chaining check - -The irb->scsw.dstat field provides the (accumulated) device status : - -DEV_STAT_ATTENTION - attention -DEV_STAT_STAT_MOD - status modifier -DEV_STAT_CU_END - control unit end -DEV_STAT_BUSY - busy -DEV_STAT_CHN_END - channel end -DEV_STAT_DEV_END - device end -DEV_STAT_UNIT_CHECK - unit check -DEV_STAT_UNIT_EXCEP - unit exception - -Please see the ESA/390 Principles of Operation manual for details on the -individual flag meanings. - -Usage Notes : - -ccw_device_start() must be called disabled and with the ccw device lock held. - -The device driver is allowed to issue the next ccw_device_start() call from -within its interrupt handler already. It is not required to schedule a -bottom-half, unless a non deterministically long running error recovery procedure -or similar needs to be scheduled. During I/O processing the Linux/390 generic -I/O device driver support has already obtained the IRQ lock, i.e. the handler -must not try to obtain it again when calling ccw_device_start() or we end in a -deadlock situation! - -If a device driver relies on an I/O request to be completed prior to start the -next it can reduce I/O processing overhead by chaining a NoOp I/O command -CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End -and Device-End status to be presented together, with a single interrupt. -However, this should be used with care as it implies the channel will remain -busy, not being able to process I/O requests for other devices on the same -channel. Therefore e.g. read commands should never use this technique, as the -result will be presented by a single interrupt anyway. - -In order to minimize I/O overhead, a device driver should use the -DOIO_REPORT_ALL only if the device can report intermediate interrupt -information prior to device-end the device driver urgently relies on. In this -case all I/O interruptions are presented to the device driver until final -status is recognized. - -If a device is able to recover from asynchronously presented I/O errors, it can -perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some -devices always report channel-end and device-end together, with a single -interrupt, others present primary status (channel-end) when the channel is -ready for the next I/O request and secondary status (device-end) when the data -transmission has been completed at the device. - -Above flag allows to exploit this feature, e.g. for communication devices that -can handle lost data on the network to allow for enhanced I/O processing. - -Unless the channel subsystem at any time presents a secondary status interrupt, -exploiting this feature will cause only primary status interrupts to be -presented to the device driver while overlapping I/O is performed. When a -secondary status without error (alert status) is presented, this indicates -successful completion for all overlapping ccw_device_start() requests that have -been issued since the last secondary (final) status. - -Channel programs that intend to set the suspend flag on a channel command word -(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the -suspend flag will cause a channel program check. At the time the channel program -becomes suspended an intermediate interrupt will be generated by the channel -subsystem. - -ccw_device_resume() - Resume Channel Program Execution - -If a device driver chooses to suspend the current channel program execution by -setting the CCW suspend flag on a particular CCW, the channel program execution -is suspended. In order to resume channel program execution the CIO layer -provides the ccw_device_resume() routine. - -int ccw_device_resume(struct ccw_device *cdev); - -cdev - ccw_device the resume operation is requested for - -The ccw_device_resume() function returns: - - 0 - suspended channel program is resumed --EBUSY - status pending --ENODEV - cdev invalid or not-operational subchannel --EINVAL - resume function not applicable --ENOTCONN - there is no I/O request pending for completion - -Usage Notes: -Please have a look at the ccw_device_start() usage notes for more details on -suspended channel programs. - -ccw_device_halt() - Halt I/O Request Processing - -Sometimes a device driver might need a possibility to stop the processing of -a long-running channel program or the device might require to initially issue -a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt() -command is provided. - -ccw_device_halt() must be called disabled and with the ccw device lock held. - -int ccw_device_halt(struct ccw_device *cdev, - unsigned long intparm); - -cdev : ccw_device the halt operation is requested for -intparm : interruption parameter; value is only used if no I/O - is outstanding, otherwise the intparm associated with - the I/O request is returned - -The ccw_device_halt() function returns : - - 0 - request successfully initiated --EBUSY - the device is currently busy, or status pending. --ENODEV - cdev invalid. --EINVAL - The device is not operational or the ccw device is not online. - -Usage Notes : - -A device driver may write a never-ending channel program by writing a channel -program that at its end loops back to its beginning by means of a transfer in -channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network -device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is -executed a program controlled interrupt (PCI) is generated. The device driver -can then perform an appropriate action. Prior to interrupt of an outstanding -read to a network device (with or without PCI flag) a ccw_device_halt() -is required to end the pending operation. - -ccw_device_clear() - Terminage I/O Request Processing - -In order to terminate all I/O processing at the subchannel, the clear subchannel -(CSCH) command is used. It can be issued via ccw_device_clear(). - -ccw_device_clear() must be called disabled and with the ccw device lock held. - -int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); - -cdev: ccw_device the clear operation is requested for -intparm: interruption parameter (see ccw_device_halt()) - -The ccw_device_clear() function returns: - - 0 - request successfully initiated --ENODEV - cdev invalid --EINVAL - The device is not operational or the ccw device is not online. - -Miscellaneous Support Routines - -This chapter describes various routines to be used in a Linux/390 device -driver programming environment. - -get_ccwdev_lock() - -Get the address of the device specific lock. This is then used in -spin_lock() / spin_unlock() calls. - - -__u8 ccw_device_get_path_mask(struct ccw_device *cdev); - -Get the mask of the path currently available for cdev. diff --git a/Documentation/s390/common_io.rst b/Documentation/s390/common_io.rst new file mode 100644 index 000000000000..846485681ce7 --- /dev/null +++ b/Documentation/s390/common_io.rst @@ -0,0 +1,140 @@ +====================== +S/390 common I/O-Layer +====================== + +command line parameters, procfs and debugfs entries +=================================================== + +Command line parameters +----------------------- + +* ccw_timeout_log + + Enable logging of debug information in case of ccw device timeouts. + +* cio_ignore = device[,device[,..]] + + device := {all | [!]ipldev | [!]condev | [!] | [!]-} + + The given devices will be ignored by the common I/O-layer; no detection + and device sensing will be done on any of those devices. The subchannel to + which the device in question is attached will be treated as if no device was + attached. + + An ignored device can be un-ignored later; see the "/proc entries"-section for + details. + + The devices must be given either as bus ids (0.x.abcd) or as hexadecimal + device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you + give a device number 0xabcd, it will be interpreted as 0.0.abcd. + + You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev' + keywords can be used to refer to the CCW based boot device and CCW console + device respectively (these are probably useful only when combined with the '!' + operator). The '!' operator will cause the I/O-layer to _not_ ignore a device. + The command line + is parsed from left to right. + + For example:: + + cio_ignore=0.0.0023-0.0.0042,0.0.4711 + + will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device + 0.0.4711, if detected. + + As another example:: + + cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02 + + will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02. + + By default, no devices are ignored. + + +/proc entries +------------- + +* /proc/cio_ignore + + Lists the ranges of devices (by bus id) which are ignored by common I/O. + + You can un-ignore certain or all devices by piping to /proc/cio_ignore. + "free all" will un-ignore all ignored devices, + "free , , ..." will un-ignore the specified + devices. + + For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored, + + - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore + will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023 + to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored; + - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device + 0.0.0041; + - echo free all > /proc/cio_ignore will un-ignore all remaining ignored + devices. + + When a device is un-ignored, device recognition and sensing is performed and + the device driver will be notified if possible, so the device will become + available to the system. Note that un-ignoring is performed asynchronously. + + You can also add ranges of devices to be ignored by piping to + /proc/cio_ignore; "add , , ..." will ignore the + specified devices. + + Note: While already known devices can be added to the list of devices to be + ignored, there will be no effect on then. However, if such a device + disappears and then reappears, it will then be ignored. To make + known devices go away, you need the "purge" command (see below). + + For example:: + + "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore" + + will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored + devices. + + You can remove already known but now ignored devices via:: + + "echo purge > /proc/cio_ignore" + + All devices ignored but still registered and not online (= not in use) + will be deregistered and thus removed from the system. + + The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward + compatibility, by the device number in hexadecimal (0xabcd or abcd). Device + numbers given as 0xabcd will be interpreted as 0.0.abcd. + +* /proc/cio_settle + + A write request to this file is blocked until all queued cio actions are + handled. This will allow userspace to wait for pending work affecting + device availability after changing cio_ignore or the hardware configuration. + +* For some of the information present in the /proc filesystem in 2.4 (namely, + /proc/subchannels and /proc/chpids), see driver-model.txt. + Information formerly in /proc/irq_count is now in /proc/interrupts. + + +debugfs entries +--------------- + +* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature) + + Some views generated by the debug feature to hold various debug outputs. + + - /sys/kernel/debug/s390dbf/cio_crw/sprintf + Messages from the processing of pending channel report words (machine check + handling). + + - /sys/kernel/debug/s390dbf/cio_msg/sprintf + Various debug messages from the common I/O-layer. + + - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii + Logs the calling of functions in the common I/O-layer and, if applicable, + which subchannel they were called for, as well as dumps of some data + structures (like irb in an error case). + + The level of logging can be changed to be more or less verbose by piping to + /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the + documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst) + for details. diff --git a/Documentation/s390/dasd.rst b/Documentation/s390/dasd.rst new file mode 100644 index 000000000000..9e22247285c8 --- /dev/null +++ b/Documentation/s390/dasd.rst @@ -0,0 +1,84 @@ +================== +DASD device driver +================== + +S/390's disk devices (DASDs) are managed by Linux via the DASD device +driver. It is valid for all types of DASDs and represents them to +Linux as block devices, namely "dd". Currently the DASD driver uses a +single major number (254) and 4 minor numbers per volume (1 for the +physical volume and 3 for partitions). With respect to partitions see +below. Thus you may have up to 64 DASD devices in your system. + +The kernel parameter 'dasd=from-to,...' may be issued arbitrary times +in the kernel's parameter line or not at all. The 'from' and 'to' +parameters are to be given in hexadecimal notation without a leading +0x. +If you supply kernel parameters the different instances are processed +in order of appearance and a minor number is reserved for any device +covered by the supplied range up to 64 volumes. Additional DASDs are +ignored. If you do not supply the 'dasd=' kernel parameter at all, the +DASD driver registers all supported DASDs of your system to a minor +number in ascending order of the subchannel number. + +The driver currently supports ECKD-devices and there are stubs for +support of the FBA and CKD architectures. For the FBA architecture +only some smart data structures are missing to make the support +complete. +We performed our testing on 3380 and 3390 type disks of different +sizes, under VM and on the bare hardware (LPAR), using internal disks +of the multiprise as well as a RAMAC virtual array. Disks exported by +an Enterprise Storage Server (Seascape) should work fine as well. + +We currently implement one partition per volume, which is the whole +volume, skipping the first blocks up to the volume label. These are +reserved for IPL records and IBM's volume label to assure +accessibility of the DASD from other OSs. In a later stage we will +provide support of partitions, maybe VTOC oriented or using a kind of +partition table in the label record. + +Usage +===== + +-Low-level format (?CKD only) +For using an ECKD-DASD as a Linux harddisk you have to low-level +format the tracks by issuing the BLKDASDFORMAT-ioctl on that +device. This will erase any data on that volume including IBM volume +labels, VTOCs etc. The ioctl may take a `struct format_data *` or +'NULL' as an argument:: + + typedef struct { + int start_unit; + int stop_unit; + int blksize; + } format_data_t; + +When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole +disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit +and stop_unit are the first and last track to be formatted. If +stop_unit is -1 it implies that the DASD is formatted from start_unit +up to the last track. blksize can be any power of two between 512 and +4096. We recommend no blksize lower than 1024 because the ext2fs uses +1kB blocks anyway and you gain approx. 50% of capacity increasing your +blksize from 512 byte to 1kB. + +Make a filesystem +================= + +Then you can mk??fs the filesystem of your choice on that volume or +partition. For reasons of sanity you should build your filesystem on +the partition /dev/dd?1 instead of the whole volume. You only lose 3kB +but may be sure that you can reuse your data after introduction of a +real partition table. + +Bugs +==== + +- Performance sometimes is rather low because we don't fully exploit clustering + +TODO-List +========= + +- Add IBM'S Disk layout to genhd +- Enhance driver to use more than one major number +- Enable usage as a module +- Support Cache fast write and DASD fast write (ECKD) diff --git a/Documentation/s390/debugging390.rst b/Documentation/s390/debugging390.rst new file mode 100644 index 000000000000..d49305fd5e1a --- /dev/null +++ b/Documentation/s390/debugging390.rst @@ -0,0 +1,2613 @@ +============================================= +Debugging on Linux for s/390 & z/Architecture +============================================= + +Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + +Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation + +.. Best viewed with fixed width fonts + +Overview of Document: +===================== +This document is intended to give a good overview of how to debug Linux for +s/390 and z/Architecture. It is not intended as a complete reference and not a +tutorial on the fundamentals of C & assembly. It doesn't go into +390 IO in any detail. It is intended to complement the documents in the +reference section below & any other worthwhile references you get. + +It is intended like the Enterprise Systems Architecture/390 Reference Summary +to be printed out & used as a quick cheat sheet self help style reference when +problems occur. + +.. Contents + ======== + Register Set + Address Spaces on Intel Linux + Address Spaces on Linux for s/390 & z/Architecture + The Linux for s/390 & z/Architecture Kernel Task Structure + Register Usage & Stackframes on Linux for s/390 & z/Architecture + A sample program with comments + Compiling programs for debugging on Linux for s/390 & z/Architecture + Debugging under VM + s/390 & z/Architecture IO Overview + Debugging IO on s/390 & z/Architecture under VM + GDB on s/390 & z/Architecture + Stack chaining in gdb by hand + Examining core dumps + ldd + Debugging modules + The proc file system + SysRq + References + Special Thanks + +Register Set +============ +The current architectures have the following registers. + +16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture, +r0-r15 (or gpr0-gpr15), used for arithmetic and addressing. + +16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15, +kernel usage only, used for memory management, interrupt control, debugging +control etc. + +16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture, +normally not used by normal programs but potentially could be used as +temporary storage. These registers have a 1:1 association with general +purpose registers and are designed to be used in the so-called access +register mode to select different address spaces. +Access register 0 (and access register 1 on z/Architecture, which needs a +64 bit pointer) is currently used by the pthread library as a pointer to +the current running threads private area. + +16 64-bit floating point registers (fp0-fp15 ) IEEE & HFP floating +point format compliant on G5 upwards & a Floating point control reg (FPC) + +4 64-bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines. + +Note: + Linux (currently) always uses IEEE & emulates G5 IEEE format on older + machines, ( provided the kernel is configured for this ). + + +The PSW is the most important register on the machine it +is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of +a program counter (pc), condition code register,memory space designator. +In IBM standard notation I am counting bit 0 as the MSB. +It has several advantages over a normal program counter +in that you can change address translation & program counter +in a single instruction. To change address translation, +e.g. switching address translation off requires that you +have a logical=physical mapping for the address you are +currently running at. + ++-------------------------+-------------------------------------------------+ +| Bit | | ++--------+----------------+ Value | +| s/390 | z/Architecture | | ++========+================+=================================================+ +| 0 | 0 | Reserved (must be 0) otherwise specification | +| | | exception occurs. | ++--------+----------------+-------------------------------------------------+ +| 1 | 1 | Program Event Recording 1 PER enabled, | +| | | PER is used to facilitate debugging e.g. | +| | | single stepping. | ++--------+----------------+-------------------------------------------------+ +| 2-4 | 2-4 | Reserved (must be 0). | ++--------+----------------+-------------------------------------------------+ +| 5 | 5 | Dynamic address translation 1=DAT on. | ++--------+----------------+-------------------------------------------------+ +| 6 | 6 | Input/Output interrupt Mask | ++--------+----------------+-------------------------------------------------+ +| 7 | 7 | External interrupt Mask used primarily for | +| | | interprocessor signalling and clock interrupts. | ++--------+----------------+-------------------------------------------------+ +| 8-11 | 8-11 | PSW Key used for complex memory protection | +| | | mechanism (not used under linux) | ++--------+----------------+-------------------------------------------------+ +| 12 | 12 | 1 on s/390 0 on z/Architecture | ++--------+----------------+-------------------------------------------------+ +| 13 | 13 | Machine Check Mask 1=enable machine check | +| | | interrupts | ++--------+----------------+-------------------------------------------------+ +| 14 | 14 | Wait State. Set this to 1 to stop the processor | +| | | except for interrupts and give time to other | +| | | LPARS. Used in CPU idle in the kernel to | +| | | increase overall usage of processor resources. | ++--------+----------------+-------------------------------------------------+ +| 15 | 15 | Problem state (if set to 1 certain instructions | +| | | are disabled). All linux user programs run with | +| | | this bit 1 (useful info for debugging under VM).| ++--------+----------------+-------------------------------------------------+ +| 16-17 | 16-17 | Address Space Control | +| | | | +| | | 00 Primary Space Mode: | +| | | | +| | | The register CR1 contains the primary | +| | | address-space control element (PASCE), which | +| | | points to the primary space region/segment | +| | | table origin. | +| | | | +| | | 01 Access register mode | +| | | | +| | | 10 Secondary Space Mode: | +| | | | +| | | The register CR7 contains the secondary | +| | | address-space control element (SASCE), which | +| | | points to the secondary space region or | +| | | segment table origin. | +| | | | +| | | 11 Home Space Mode: | +| | | | +| | | The register CR13 contains the home space | +| | | address-space control element (HASCE), which | +| | | points to the home space region/segment | +| | | table origin. | +| | | | +| | | See "Address Spaces on Linux for s/390 & | +| | | z/Architecture" below for more information | +| | | about address space usage in Linux. | ++--------+----------------+-------------------------------------------------+ +| 18-19 | 18-19 | Condition codes (CC) | ++--------+----------------+-------------------------------------------------+ +| 20 | 20 | Fixed point overflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 21 | 21 | Decimal overflow mask if 1=FPU exceptions for | +| | | this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 22 | 22 | Exponent underflow mask if 1=FPU exceptions | +| | | for this event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 23 | 23 | Significance Mask if 1=FPU exceptions for this | +| | | event occur (normally 0) | ++--------+----------------+-------------------------------------------------+ +| 24-31 | 24-30 | Reserved Must be 0. | +| +----------------+-------------------------------------------------+ +| | 31 | Extended Addressing Mode | +| +----------------+-------------------------------------------------+ +| | 32 | Basic Addressing Mode | +| | | | +| | | Used to set addressing mode | +| | | | +| | | +---------+----------+----------+ | +| | | | PSW 31 | PSW 32 | | | +| | | +---------+----------+----------+ | +| | | | 0 | 0 | 24 bit | | +| | | +---------+----------+----------+ | +| | | | 0 | 1 | 31 bit | | +| | | +---------+----------+----------+ | +| | | | 1 | 1 | 64 bit | | +| | | +---------+----------+----------+ | ++--------+----------------+-------------------------------------------------+ +| 32 | | 1=31 bit addressing mode 0=24 bit addressing | +| | | mode (for backward compatibility), linux | +| | | always runs with this bit set to 1 | ++--------+----------------+-------------------------------------------------+ +| 33-64 | | Instruction address. | +| +----------------+-------------------------------------------------+ +| | 33-63 | Reserved must be 0 | +| +----------------+-------------------------------------------------+ +| | 64-127 | Address | +| | | | +| | | - In 24 bits mode bits 64-103=0 bits 104-127 | +| | | Address | +| | | - In 31 bits mode bits 64-96=0 bits 97-127 | +| | | Address | +| | | | +| | | Note: | +| | | unlike 31 bit mode on s/390 bit 96 must be | +| | | zero when loading the address with LPSWE | +| | | otherwise a specification exception occurs, | +| | | LPSW is fully backward compatible. | ++--------+----------------+-------------------------------------------------+ + +Prefix Page(s) +-------------- +This per cpu memory area is too intimately tied to the processor not to mention. +It exists between the real addresses 0-4096 on s/390 and between 0-8192 on +z/Architecture and is exchanged with one page on s/390 or two pages on +z/Architecture in absolute storage by the set prefix instruction during Linux +startup. + +This page is mapped to a different prefix for each processor in an SMP +configuration (assuming the OS designer is sane of course). + +Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on +z/Architecture are used by the processor itself for holding such information +as exception indications and entry points for exceptions. + +Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and +z/Architecture (there is a gap on z/Architecture currently between 0xc00 and +0x1000, too, which is used by Linux). + +The closest thing to this on traditional architectures is the interrupt +vector table. This is a good thing & does simplify some of the kernel coding +however it means that we now cannot catch stray NULL pointers in the +kernel without hard coded checks. + + + +Address Spaces on Intel Linux +============================= + +The traditional Intel Linux is approximately mapped as follows forgive +the ascii art:: + + 0xFFFFFFFF 4GB Himem ***************** + * * + * Kernel Space * + * * + ***************** **************** + User Space Himem * User Stack * * * + (typically 0xC0000000 3GB ) ***************** * * + * Shared Libs * * Next Process * + ***************** * to * + * * <== * Run * <== + * User Program * * * + * Data BSS * * * + * Text * * * + * Sections * * * + 0x00000000 ***************** **************** + +Now it is easy to see that on Intel it is quite easy to recognise a kernel +address as being one greater than user space himem (in this case 0xC0000000), +and addresses of less than this are the ones in the current running program on +this processor (if an smp box). + +If using the virtual machine ( VM ) as a debugger it is quite difficult to +know which user process is running as the address space you are looking at +could be from any process in the run queue. + +The limitation of Intels addressing technique is that the linux +kernel uses a very simple real address to virtual addressing technique +of Real Address=Virtual Address-User Space Himem. +This means that on Intel the kernel linux can typically only address +Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines +can typically use. + +They can lower User Himem to 2GB or lower & thus be +able to use 2GB of RAM however this shrinks the maximum size +of User Space from 3GB to 2GB they have a no win limit of 4GB unless +they go to 64 Bit. + + +On 390 our limitations & strengths make us slightly different. +For backward compatibility we are only allowed use 31 bits (2GB) +of our 32 bit addresses, however, we use entirely separate address +spaces for the user & kernel. + +This means we can support 2GB of non Extended RAM on s/390, & more +with the Extended memory management swap device & +currently 4TB of physical memory currently on z/Architecture. + + +Address Spaces on Linux for s/390 & z/Architecture +================================================== + +Our addressing scheme is basically as follows:: + + Primary Space Home Space + Himem 0x7fffffff 2GB on s/390 ***************** **************** + currently 0x3ffffffffff (2^42)-1 * User Stack * * * + on z/Architecture. ***************** * * + * Shared Libs * * * + ***************** * * + * * * Kernel * + * User Program * * * + * Data BSS * * * + * Text * * * + * Sections * * * + 0x00000000 ***************** **************** + +This also means that we need to look at the PSW problem state bit and the +addressing mode to decide whether we are looking at user or kernel space. + +User space runs in primary address mode (or access register mode within +the vdso code). + +The kernel usually also runs in home space mode, however when accessing +user space the kernel switches to primary or secondary address mode if +the mvcos instruction is not available or if a compare-and-swap (futex) +instruction on a user space address is performed. + +When also looking at the ASCE control registers, this means: + +User space: + +- runs in primary or access register mode +- cr1 contains the user asce +- cr7 contains the user asce +- cr13 contains the kernel asce + +Kernel space: + +- runs in home space mode +- cr1 contains the user or kernel asce + + - the kernel asce is loaded when a uaccess requires primary or + secondary address mode + +- cr7 contains the user or kernel asce, (changed with set_fs()) +- cr13 contains the kernel asce + +In case of uaccess the kernel changes to: + +- primary space mode in case of a uaccess (copy_to_user) and uses + e.g. the mvcp instruction to access user space. However the kernel + will stay in home space mode if the mvcos instruction is available +- secondary space mode in case of futex atomic operations, so that the + instructions come from primary address space and data from secondary + space + +In case of KVM, the kernel runs in home space mode, but cr1 gets switched +to contain the gmap asce before the SIE instruction gets executed. When +the SIE instruction is finished, cr1 will be switched back to contain the +user asce. + + +Virtual Addresses on s/390 & z/Architecture +=========================================== + +A virtual address on s/390 is made up of 3 parts +The SX (segment index, roughly corresponding to the PGD & PMD in Linux +terminology) being bits 1-11. + +The PX (page index, corresponding to the page table entry (pte) in Linux +terminology) being bits 12-19. + +The remaining bits BX (the byte index are the offset in the page ) +i.e. bits 20 to 31. + +On z/Architecture in linux we currently make up an address from 4 parts. + +- The region index bits (RX) 0-32 we currently use bits 22-32 +- The segment index (SX) being bits 33-43 +- The page index (PX) being bits 44-51 +- The byte index (BX) being bits 52-63 + +Notes: + 1) s/390 has no PMD so the PMD is really the PGD also. + A lot of this stuff is defined in pgtable.h. + + 2) Also seeing as s/390's page indexes are only 1k in size + (bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k ) + to make the best use of memory by updating 4 segment indices + entries each time we mess with a PMD & use offsets + 0,1024,2048 & 3072 in this page as for our segment indexes. + On z/Architecture our page indexes are now 2k in size + ( bits 12-19 x 8 bytes per pte ) we do a similar trick + but only mess with 2 segment indices each time we mess with + a PMD. + + 3) As z/Architecture supports up to a massive 5-level page table lookup we + can only use 3 currently on Linux ( as this is all the generic kernel + currently supports ) however this may change in future + this allows us to access ( according to my sums ) + 4TB of virtual storage per process i.e. + 4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes, + enough for another 2 or 3 of years I think :-). + to do this we use a region-third-table designation type in + our address space control registers. + + +The Linux for s/390 & z/Architecture Kernel Task Structure +========================================================== +Each process/thread under Linux for S390 has its own kernel task_struct +defined in linux/include/linux/sched.h +The S390 on initialisation & resuming of a process on a cpu sets +the __LC_KERNEL_STACK variable in the spare prefix area for this cpu +(which we use for per-processor globals). + +The kernel stack pointer is intimately tied with the task structure for +each processor as follows:: + + s/390 + ************************ + * 1 page kernel stack * + * ( 4K ) * + ************************ + * 1 page task_struct * + * ( 4K ) * + 8K aligned ************************ + + z/Architecture + ************************ + * 2 page kernel stack * + * ( 8K ) * + ************************ + * 2 page task_struct * + * ( 8K ) * + 16K aligned ************************ + +What this means is that we don't need to dedicate any register or global +variable to point to the current running process & can retrieve it with the +following very simple construct for s/390 & one very similar for +z/Architecture:: + + static inline struct task_struct * get_current(void) + { + struct task_struct *current; + __asm__("lhi %0,-8192\n\t" + "nr %0,15" + : "=r" (current) ); + return current; + } + +i.e. just anding the current kernel stack pointer with the mask -8192. +Thankfully because Linux doesn't have support for nested IO interrupts +& our devices have large buffers can survive interrupts being shut for +short amounts of time we don't need a separate stack for interrupts. + + + + +Register Usage & Stackframes on Linux for s/390 & z/Architecture +================================================================= +Overview: +--------- +This is the code that gcc produces at the top & the bottom of +each function. It usually is fairly consistent & similar from +function to function & if you know its layout you can probably +make some headway in finding the ultimate cause of a problem +after a crash without a source level debugger. + +Note: To follow stackframes requires a knowledge of C or Pascal & +limited knowledge of one assembly language. + +It should be noted that there are some differences between the +s/390 and z/Architecture stack layouts as the z/Architecture stack layout +didn't have to maintain compatibility with older linkage formats. + +Glossary: +--------- +alloca: + This is a built in compiler function for runtime allocation + of extra space on the callers stack which is obviously freed + up on function exit ( e.g. the caller may choose to allocate nothing + of a buffer of 4k if required for temporary purposes ), it generates + very efficient code ( a few cycles ) when compared to alternatives + like malloc. + +automatics: + These are local variables on the stack, i.e they aren't in registers & + they aren't static. + +back-chain: + This is a pointer to the stack pointer before entering a + framed functions ( see frameless function ) prologue got by + dereferencing the address of the current stack pointer, + i.e. got by accessing the 32 bit value at the stack pointers + current location. + +base-pointer: + This is a pointer to the back of the literal pool which + is an area just behind each procedure used to store constants + in each function. + +call-clobbered: + The caller probably needs to save these registers if there + is something of value in them, on the stack or elsewhere before making a + call to another procedure so that it can restore it later. + +epilogue: + The code generated by the compiler to return to the caller. + +frameless-function: + A frameless function in Linux for s390 & z/Architecture is one which doesn't + need more than the register save area (96 bytes on s/390, 160 on z/Architecture) + given to it by the caller. + + A frameless function never: + + 1) Sets up a back chain. + 2) Calls alloca. + 3) Calls other normal functions + 4) Has automatics. + +GOT-pointer: + This is a pointer to the global-offset-table in ELF + ( Executable Linkable Format, Linux'es most common executable format ), + all globals & shared library objects are found using this pointer. + +lazy-binding + ELF shared libraries are typically only loaded when routines in the shared + library are actually first called at runtime. This is lazy binding. + +procedure-linkage-table + This is a table found from the GOT which contains pointers to routines + in other shared libraries which can't be called to by easier means. + +prologue: + The code generated by the compiler to set up the stack frame. + +outgoing-args: + This is extra area allocated on the stack of the calling function if the + parameters for the callee's cannot all be put in registers, the same + area can be reused by each function the caller calls. + +routine-descriptor: + A COFF executable format based concept of a procedure reference + actually being 8 bytes or more as opposed to a simple pointer to the routine. + This is typically defined as follows: + + - Routine Descriptor offset 0=Pointer to Function + - Routine Descriptor offset 4=Pointer to Table of Contents + + The table of contents/TOC is roughly equivalent to a GOT pointer. + & it means that shared libraries etc. can be shared between several + environments each with their own TOC. + +static-chain: + This is used in nested functions a concept adopted from pascal + by gcc not used in ansi C or C++ ( although quite useful ), basically it + is a pointer used to reference local variables of enclosing functions. + You might come across this stuff once or twice in your lifetime. + + e.g. + + The function below should return 11 though gcc may get upset & toss warnings + about unused variables:: + + int FunctionA(int a) + { + int b; + FunctionC(int c) + { + b=c+1; + } + FunctionC(10); + return(b); + } + + +s/390 & z/Architecture Register usage +===================================== + +======== ========================================== =============== +r0 used by syscalls/assembly call-clobbered +r1 used by syscalls/assembly call-clobbered +r2 argument 0 / return value 0 call-clobbered +r3 argument 1 / return value 1 (if long long) call-clobbered +r4 argument 2 call-clobbered +r5 argument 3 call-clobbered +r6 argument 4 saved +r7 pointer-to arguments 5 to ... saved +r8 this & that saved +r9 this & that saved +r10 static-chain ( if nested function ) saved +r11 frame-pointer ( if function used alloca ) saved +r12 got-pointer saved +r13 base-pointer saved +r14 return-address saved +r15 stack-pointer saved + +f0 argument 0 / return value ( float/double ) call-clobbered +f2 argument 1 call-clobbered +f4 z/Architecture argument 2 saved +f6 z/Architecture argument 3 saved +======== ========================================== =============== + +The remaining floating points +f1,f3,f5 f7-f15 are call-clobbered. + +Notes: +------ +1) The only requirement is that registers which are used + by the callee are saved, e.g. the compiler is perfectly + capable of using r11 for purposes other than a frame a + frame pointer if a frame pointer is not needed. +2) In functions with variable arguments e.g. printf the calling procedure + is identical to one without variable arguments & the same number of + parameters. However, the prologue of this function is somewhat more + hairy owing to it having to move these parameters to the stack to + get va_start, va_arg & va_end to work. +3) Access registers are currently unused by gcc but are used in + the kernel. Possibilities exist to use them at the moment for + temporary storage but it isn't recommended. +4) Only 4 of the floating point registers are used for + parameter passing as older machines such as G3 only have only 4 + & it keeps the stack frame compatible with other compilers. + However with IEEE floating point emulation under linux on the + older machines you are free to use the other 12. +5) A long long or double parameter cannot be have the + first 4 bytes in a register & the second four bytes in the + outgoing args area. It must be purely in the outgoing args + area if crossing this boundary. +6) Floating point parameters are mixed with outgoing args + on the outgoing args area in the order the are passed in as parameters. +7) Floating point arguments 2 & 3 are saved in the outgoing args area for + z/Architecture + + +Stack Frame Layout +------------------ + +========= ============== ====================================================== +s/390 z/Architecture +========= ============== ====================================================== +0 0 back chain ( a 0 here signifies end of back chain ) +4 8 eos ( end of stack, not used on Linux for S390 used + in other linkage formats ) +8 16 glue used in other s/390 linkage formats for saved + routine descriptors etc. +12 24 glue used in other s/390 linkage formats for saved + routine descriptors etc. +16 32 scratch area +20 40 scratch area +24 48 saved r6 of caller function +28 56 saved r7 of caller function +32 64 saved r8 of caller function +36 72 saved r9 of caller function +40 80 saved r10 of caller function +44 88 saved r11 of caller function +48 96 saved r12 of caller function +52 104 saved r13 of caller function +56 112 saved r14 of caller function +60 120 saved r15 of caller function +64 128 saved f4 of caller function +72 132 saved f6 of caller function +80 undefined +96 160 outgoing args passed from caller to callee +96+x 160+x possible stack alignment ( 8 bytes desirable ) +96+x+y 160+x+y alloca space of caller ( if used ) +96+x+y+z 160+x+y+z automatics of caller ( if used ) +0 back-chain +========= ============== ====================================================== + +A sample program with comments. +=============================== + +Comments on the function test +----------------------------- +1) It didn't need to set up a pointer to the constant pool gpr13 as it is not + used ( :-( ). +2) This is a frameless function & no stack is bought. +3) The compiler was clever enough to recognise that it could return the + value in r2 as well as use it for the passed in parameter ( :-) ). +4) The basr ( branch relative & save ) trick works as follows the instruction + has a special case with r0,r0 with some instruction operands is understood as + the literal value 0, some risc architectures also do this ). So now + we are branching to the next address & the address new program counter is + in r13,so now we subtract the size of the function prologue we have executed + the size of the literal pool to get to the top of the literal pool:: + + + 0040037c int test(int b) + { # Function prologue below + 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14 + 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using + 400382: a7 da ff fa ahi %r13,-6 # basr trick + return(5+b); + # Huge main program + 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2 + + # Function epilogue below + 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14 + 40038e: 07 fe br %r14 # return + } + +Comments on the function main +----------------------------- +1) The compiler did this function optimally ( 8-) ):: + + Literal pool for main. + 400390: ff ff ff ec .long 0xffffffec + main(int argc,char *argv[]) + { # Function prologue below + 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers + 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0 + 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving + 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to + 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool + 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain + + return(test(5)); # Main Program Below + 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from + # literal pool + 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5 + 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return + # address using branch & save instruction. + + # Function Epilogue below + 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers. + 4003b8: 07 fe br %r14 # return to do program exit + } + + +Compiler updates +---------------- + +:: + + main(int argc,char *argv[]) + { + 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15) + 400500: a7 d5 00 04 bras %r13,400508 + 400504: 00 40 04 f4 .long 0x004004f4 + # compiler now puts constant pool in code to so it saves an instruction + 400508: 18 0f lr %r0,%r15 + 40050a: a7 fa ff a0 ahi %r15,-96 + 40050e: 50 00 f0 00 st %r0,0(%r15) + return(test(5)); + 400512: 58 10 d0 00 l %r1,0(%r13) + 400516: a7 28 00 05 lhi %r2,5 + 40051a: 0d e1 basr %r14,%r1 + # compiler adds 1 extra instruction to epilogue this is done to + # avoid processor pipeline stalls owing to data dependencies on g5 & + # above as register 14 in the old code was needed directly after being loaded + # by the lm %r11,%r15,140(%r15) for the br %14. + 40051c: 58 40 f0 98 l %r4,152(%r15) + 400520: 98 7f f0 7c lm %r7,%r15,124(%r15) + 400524: 07 f4 br %r4 + } + + +Hartmut ( our compiler developer ) also has been threatening to take out the +stack backchain in optimised code as this also causes pipeline stalls, you +have been warned. + +64 bit z/Architecture code disassembly +-------------------------------------- + +If you understand the stuff above you'll understand the stuff +below too so I'll avoid repeating myself & just say that +some of the instructions have g's on the end of them to indicate +they are 64 bit & the stack offsets are a bigger, +the only other difference you'll find between 32 & 64 bit is that +we now use f4 & f6 for floating point arguments on 64 bit:: + + 00000000800005b0 : + int test(int b) + { + return(5+b); + 800005b0: a7 2a 00 05 ahi %r2,5 + 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer + 800005b8: 07 fe br %r14 + 800005ba: 07 07 bcr 0,%r7 + + + } + + 00000000800005bc
: + main(int argc,char *argv[]) + { + 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15) + 800005c2: b9 04 00 1f lgr %r1,%r15 + 800005c6: a7 fb ff 60 aghi %r15,-160 + 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15) + return(test(5)); + 800005d0: a7 29 00 05 lghi %r2,5 + # brasl allows jumps > 64k & is overkill here bras would do fune + 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 + 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15) + 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15) + 800005e6: 07 f4 br %r4 + } + + + +Compiling programs for debugging on Linux for s/390 & z/Architecture +==================================================================== +-gdwarf-2 now works it should be considered the default debugging +format for s/390 & z/Architecture as it is more reliable for debugging +shared libraries, normal -g debugging works much better now +Thanks to the IBM java compiler developers bug reports. + +This is typically done adding/appending the flags -g or -gdwarf-2 to the +CFLAGS & LDFLAGS variables Makefile of the program concerned. + +If using gdb & you would like accurate displays of registers & +stack traces compile without optimisation i.e make sure +that there is no -O2 or similar on the CFLAGS line of the Makefile & +the emitted gcc commands, obviously this will produce worse code +( not advisable for shipment ) but it is an aid to the debugging process. + +This aids debugging because the compiler will copy parameters passed in +in registers onto the stack so backtracing & looking at passed in +parameters will work, however some larger programs which use inline functions +will not compile without optimisation. + +Debugging with optimisation has since much improved after fixing +some bugs, please make sure you are using gdb-5.0 or later developed +after Nov'2000. + + + +Debugging under VM +================== + +Notes +----- +Addresses & values in the VM debugger are always hex never decimal +Address ranges are of the format - or +. +For example, the address range 0x2000 to 0x3000 can be described as 2000-3000 +or 2000.1000 + +The VM Debugger is case insensitive. + +VM's strengths are usually other debuggers weaknesses you can get at any +resource no matter how sensitive e.g. memory management resources, change +address translation in the PSW. For kernel hacking you will reap dividends if +you get good at it. + +The VM Debugger displays operators but not operands, and also the debugger +displays useful information on the same line as the author of the code probably +felt that it was a good idea not to go over the 80 columns on the screen. +This isn't as unintuitive as it may seem as the s/390 instructions are easy to +decode mentally and you can make a good guess at a lot of them as all the +operands are nibble (half byte aligned). +So if you have an objdump listing by hand, it is quite easy to follow, and if +you don't have an objdump listing keep a copy of the s/390 Reference Summary +or alternatively the s/390 principles of operation next to you. +e.g. even I can guess that +0001AFF8' LR 180F CC 0 +is a ( load register ) lr r0,r15 + +Also it is very easy to tell the length of a 390 instruction from the 2 most +significant bits in the instruction (not that this info is really useful except +if you are trying to make sense of a hexdump of code). +Here is a table + +======================= ================== +Bits Instruction Length +======================= ================== +00 2 Bytes +01 4 Bytes +10 4 Bytes +11 6 Bytes +======================= ================== + +The debugger also displays other useful info on the same line such as the +addresses being operated on destination addresses of branches & condition codes. +e.g.:: + + 00019736' AHI A7DAFF0E CC 1 + 000198BA' BRC A7840004 -> 000198C2' CC 0 + 000198CE' STM 900EF068 >> 0FA95E78 CC 2 + + + +Useful VM debugger commands +--------------------------- + +I suppose I'd better mention this before I start +to list the current active traces do:: + + Q TR + +there can be a maximum of 255 of these per set +( more about trace sets later ). + +To stop traces issue a:: + + TR END. + +To delete a particular breakpoint issue:: + + TR DEL + +The PA1 key drops to CP mode so you can issue debugger commands, +Doing alt c (on my 3270 console at least ) clears the screen. + +hitting b comes back to the running operating system +from cp mode ( in our case linux ). + +It is typically useful to add shortcuts to your profile.exec file +if you have one ( this is roughly equivalent to autoexec.bat in DOS ). +file here are a few from mine:: + + /* this gives me command history on issuing f12 */ + set pf12 retrieve + /* this continues */ + set pf8 imm b + /* goes to trace set a */ + set pf1 imm tr goto a + /* goes to trace set b */ + set pf2 imm tr goto b + /* goes to trace set c */ + set pf3 imm tr goto c + + + +Instruction Tracing +------------------- +Setting a simple breakpoint:: + + TR I PSWA
+ +To debug a particular function try:: + + TR I R + TR I on its own will single step. + TR I DATA will trace for particular mnemonics + +e.g.:: + + TR I DATA 4D R 0197BC.4000 + +will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000 + +if you were inclined you could add traces for all branch instructions & +suffix them with the run prefix so you would have a backtrace on screen +when a program crashes:: + + TR BR will trace branches into or out of an address. + +e.g.:: + + TR BR INTO 0 + +is often quite useful if a program is getting awkward & deciding +to branch to 0 & crashing as this will stop at the address before in jumps to 0. + +:: + + TR I R
RUN cmd d g + +single steps a range of addresses but stays running & +displays the gprs on each step. + + + +Displaying & modifying Registers +-------------------------------- +D G + will display all the gprs + +Adding a extra G to all the commands is necessary to access the full 64 bit +content in VM on z/Architecture. Obviously this isn't required for access +registers as these are still 32 bit. + +e.g. + +DGG + instead of DG + +D X + will display all the control registers +D AR + will display all the access registers +D AR4-7 + will display access registers 4 to 7 +CPU ALL D G + will display the GRPS of all CPUS in the configuration +D PSW + will display the current PSW +st PSW 2000 + will put the value 2000 into the PSW & cause crash your machine. +D PREFIX + displays the prefix offset + + +Displaying Memory +----------------- +To display memory mapped using the current PSW's mapping try:: + + D + +To make VM display a message each time it hits a particular address and +continue try: + +D I + will disassemble/display a range of instructions. + +ST addr 32 bit word + will store a 32 bit aligned address +D T + will display the EBCDIC in an address (if you are that way inclined) +D R + will display real addresses ( without DAT ) but with prefixing. + +There are other complex options to display if you need to get at say home space +but are in primary space the easiest thing to do is to temporarily +modify the PSW to the other addressing mode, display the stuff & then +restore it. + + + +Hints +----- +If you want to issue a debugger command without halting your virtual machine +with the PA1 key try prefixing the command with #CP e.g.:: + + #cp tr i pswa 2000 + +also suffixing most debugger commands with RUN will cause them not +to stop just display the mnemonic at the current instruction on the console. + +If you have several breakpoints you want to put into your program & +you get fed up of cross referencing with System.map +you can do the following trick for several symbols. + +:: + + grep do_signal System.map + +which emits the following among other things:: + + 0001f4e0 T do_signal + +now you can do:: + + TR I PSWA 0001f4e0 cmd msg * do_signal + +This sends a message to your own console each time do_signal is entered. +( As an aside I wrote a perl script once which automatically generated a REXX +script with breakpoints on every kernel procedure, this isn't a good idea +because there are thousands of these routines & VM can only set 255 breakpoints +at a time so you nearly had to spend as long pruning the file down as you would +entering the msgs by hand), however, the trick might be useful for a single +object file. In the 3270 terminal emulator x3270 there is a very useful option +in the file menu called "Save Screen In File" - this is very good for keeping a +copy of traces. + +From CMS help will give you online help on a particular command. +e.g.:: + + HELP DISPLAY + +Also CP has a file called profile.exec which automatically gets called +on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session +CP has a feature similar to doskey, it may be useful for you to +use profile.exec to define some keystrokes. + +SET PF9 IMM B + This does a single step in VM on pressing F8. + +SET PF10 ^ + This sets up the ^ key. + which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed + directly into some 3270 consoles. + +SET PF11 ^- + This types the starting keystrokes for a sysrq see SysRq below. +SET PF12 RETRIEVE + This retrieves command history on pressing F12. + + +Sometimes in VM the display is set up to scroll automatically this +can be very annoying if there are messages you wish to look at +to stop this do + +TERM MORE 255 255 + This will nearly stop automatic screen updates, however it will + cause a denial of service if lots of messages go to the 3270 console, + so it would be foolish to use this as the default on a production machine. + + +Tracing particular processes +---------------------------- +The kernel's text segment is intentionally at an address in memory that it will +very seldom collide with text segments of user programs ( thanks Martin ), +this simplifies debugging the kernel. +However it is quite common for user processes to have addresses which collide +this can make debugging a particular process under VM painful under normal +circumstances as the process may change when doing a:: + + TR I R
. + +Thankfully after reading VM's online help I figured out how to debug +I particular process. + +Your first problem is to find the STD ( segment table designation ) +of the program you wish to debug. +There are several ways you can do this here are a few + +Run:: + + objdump --syms | grep main + +To get the address of main in the program. Then:: + + tr i pswa
+ +Start the program, if VM drops to CP on what looks like the entry +point of the main function this is most likely the process you wish to debug. +Now do a D X13 or D XG13 on z/Architecture. + +On 31 bit the STD is bits 1-19 ( the STO segment table origin ) +& 25-31 ( the STL segment table length ) of CR13. + +now type:: + + TR I R STD 0.7fffffff + +e.g.:: + + TR I R STD 8F32E1FF 0.7fffffff + +Another very useful variation is:: + + TR STORE INTO STD
+ +for finding out when a particular variable changes. + +An alternative way of finding the STD of a currently running process +is to do the following, ( this method is more complex but +could be quite convenient if you aren't updating the kernel much & +so your kernel structures will stay constant for a reasonable period of +time ). + +:: + + grep task /proc//status + +from this you should see something like:: + + task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68 + +This now gives you a pointer to the task structure. + +Now make:: + + CC:="s390-gcc -g" kernel/sched.s + +To get the task_struct stabinfo. + +( task_struct is defined in include/linux/sched.h ). + +Now we want to look at +task->active_mm->pgd + +on my machine the active_mm in the task structure stab is +active_mm:(4,12),672,32 + +its offset is 672/8=84=0x54 + +the pgd member in the mm_struct stab is +pgd:(4,6)=*(29,5),96,32 +so its offset is 96/8=12=0xc + +so we'll:: + + hexdump -s 0xf160054 /dev/mem | more + +i.e. task_struct+active_mm offset +to look at the active_mm member:: + + f160054 0fee cc60 0019 e334 0000 0000 0000 0011 + +:: + + hexdump -s 0x0feecc6c /dev/mem | more + +i.e. active_mm+pgd offset:: + + feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010 + +we get something like +now do:: + + TR I R STD 0.7fffffff + +i.e. the 0x7f is added because the pgd only +gives the page table origin & we need to set the low bits +to the maximum possible segment table length. + +:: + + TR I R STD 0f2c007f 0.7fffffff + +on z/Architecture you'll probably need to do:: + + TR I R STD 0.ffffffffffffffff + +to set the TableType to 0x1 & the Table length to 3. + + + +Tracing Program Exceptions +-------------------------- +If you get a crash which says something like +illegal operation or specification exception followed by a register dump +You can restart linux & trace these using the tr prog trace +option. + + +The most common ones you will normally be tracing for is: + +- 1=operation exception +- 2=privileged operation exception +- 4=protection exception +- 5=addressing exception +- 6=specification exception +- 10=segment translation exception +- 11=page translation exception + +The full list of these is on page 22 of the current s/390 Reference Summary. +e.g. + +tr prog 10 will trace segment translation exceptions. + +tr prog on its own will trace all program interruption codes. + +Trace Sets +---------- +On starting VM you are initially in the INITIAL trace set. +You can do a Q TR to verify this. +If you have a complex tracing situation where you wish to wait for instance +till a driver is open before you start tracing IO, but know in your +heart that you are going to have to make several runs through the code till you +have a clue whats going on. + +What you can do is:: + + TR I PSWA + +hit b to continue till breakpoint + +reach the breakpoint + +now do your:: + + TR GOTO B + TR IO 7c08-7c09 inst int run + +or whatever the IO channels you wish to trace are & hit b + +To got back to the initial trace set do:: + + TR GOTO INITIAL + +& the TR I PSWA will be the only active breakpoint again. + + +Tracing linux syscalls under VM +------------------------------- +Syscalls are implemented on Linux for S390 by the Supervisor call instruction +(SVC). There 256 possibilities of these as the instruction is made up of a 0xA +opcode and the second byte being the syscall number. They are traced using the +simple command:: + + TR SVC + +the syscalls are defined in linux/arch/s390/include/asm/unistd.h +e.g. to trace all file opens just do:: + + TR SVC 5 ( as this is the syscall number of open ) + + +SMP Specific commands +--------------------- +To find out how many cpus you have +Q CPUS displays all the CPU's available to your virtual machine +To find the cpu that the current cpu VM debugger commands are being directed at +do Q CPU to change the current cpu VM debugger commands are being directed at +do:: + + CPU + +On a SMP guest issue a command to all CPUs try prefixing the command with cpu +all. To issue a command to a particular cpu try cpu e.g.:: + + CPU 01 TR I R 2000.3000 + +If you are running on a guest with several cpus & you have a IO related problem +& cannot follow the flow of code but you know it isn't smp related. + +from the bash prompt issue:: + + shutdown -h now or halt. + +do a:: + + Q CPUS + +to find out how many cpus you have detach each one of them from cp except +cpu 0 by issuing a:: + + DETACH CPU 01-(number of cpus in configuration) + +& boot linux again. + +TR SIGP + will trace inter processor signal processor instructions. + +DEFINE CPU 01-(number in configuration) + will get your guests cpus back. + + +Help for displaying ascii textstrings +------------------------------------- +On the very latest VM Nucleus'es VM can now display ascii +( thanks Neale for the hint ) by doing:: + + D TX. + +e.g.:: + + D TX0.100 + +Alternatively +============= +Under older VM debuggers (I love EBDIC too) you can use following little +program which converts a command line of hex digits to ascii text. It can be +compiled under linux and you can copy the hex digits from your x3270 terminal +to your xterm if you are debugging from a linuxbox. + +This is quite useful when looking at a parameter passed in as a text string +under VM ( unless you are good at decoding ASCII in your head ). + +e.g. consider tracing an open syscall:: + + TR SVC 5 + +We have stopped at a breakpoint:: + + 000151B0' SVC 0A05 -> 0001909A' CC 0 + +D 20.8 to check the SVC old psw in the prefix area and see was it from userspace +(for the layout of the prefix area consult the "Fixed Storage Locations" +chapter of the s/390 Reference Summary if you have it available). + +:: + + V00000020 070C2000 800151B2 + +The problem state bit wasn't set & it's also too early in the boot sequence +for it to be a userspace SVC if it was we would have to temporarily switch the +psw to user space addressing so we could get at the first parameter of the open +in gpr2. + +Next do a:: + + D G2 + GPR 2 = 00014CB4 + +Now display what gpr2 is pointing to:: + + D 00014CB4.20 + V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5 + V00014CC4 FC00014C B4001001 E0001000 B8070707 + +Now copy the text till the first 00 hex ( which is the end of the string +to an xterm & do hex2ascii on it:: + + hex2ascii 2F646576 2F636F6E 736F6C65 00 + +outputs:: + + Decoded Hex:=/ d e v / c o n s o l e 0x00 + +We were opening the console device, + +You can compile the code below yourself for practice :-), + +:: + + /* + * hex2ascii.c + * a useful little tool for converting a hexadecimal command line to ascii + * + * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) + * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation. + */ + #include + + int main(int argc,char *argv[]) + { + int cnt1,cnt2,len,toggle=0; + int startcnt=1; + unsigned char c,hex; + + if(argc>1&&(strcmp(argv[1],"-a")==0)) + startcnt=2; + printf("Decoded Hex:="); + for(cnt1=startcnt;cnt1='0'&&c<='9') + c=c-'0'; + if(c>='A'&&c<='F') + c=c-'A'+10; + if(c>='a'&&c<='f') + c=c-'a'+10; + switch(toggle) + { + case 0: + hex=c<<4; + toggle=1; + break; + case 1: + hex+=c; + if(hex<32||hex>127) + { + if(startcnt==1) + printf("0x%02X ",(int)hex); + else + printf("."); + } + else + { + printf("%c",hex); + if(startcnt==1) + printf(" "); + } + toggle=0; + break; + } + } + } + printf("\n"); + } + + + + +Stack tracing under VM +---------------------- +A basic backtrace +----------------- + +Here are the tricks I use 9 out of 10 times it works pretty well, + +When your backchain reaches a dead end +-------------------------------------- +This can happen when an exception happens in the kernel and the kernel is +entered twice. If you reach the NULL pointer at the end of the back chain you +should be able to sniff further back if you follow the following tricks. +1) A kernel address should be easy to recognise since it is in +primary space & the problem state bit isn't set & also +The Hi bit of the address is set. +2) Another backchain should also be easy to recognise since it is an +address pointing to another address approximately 100 bytes or 0x70 hex +behind the current stackpointer. + + +Here is some practice. + +boot the kernel & hit PA1 at some random time + +d g to display the gprs, this should display something like:: + + GPR 0 = 00000001 00156018 0014359C 00000000 + GPR 4 = 00000001 001B8888 000003E0 00000000 + GPR 8 = 00100080 00100084 00000000 000FE000 + GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8 + +Note that GPR14 is a return address but as we are real men we are going to +trace the stack. +display 0x40 bytes after the stack pointer:: + + V000FFED8 000FFF38 8001B838 80014C8E 000FFF38 + V000FFEE8 00000000 00000000 000003E0 00000000 + V000FFEF8 00100080 00100084 00000000 000FE000 + V000FFF08 00010400 8001B2DC 8001B36A 000FFED8 + + +Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if +you look above at our stackframe & also agrees with GPR14. + +now backchain:: + + d 000FFF38.40 + +we now are taking the contents of SP to get our first backchain:: + + V000FFF38 000FFFA0 00000000 00014995 00147094 + V000FFF48 00147090 001470A0 000003E0 00000000 + V000FFF58 00100080 00100084 00000000 001BF1D0 + V000FFF68 00010400 800149BA 80014CA6 000FFF38 + +This displays a 2nd return address of 80014CA6 + +now do:: + + d 000FFFA0.40 + +for our 3rd backchain:: + + V000FFFA0 04B52002 0001107F 00000000 00000000 + V000FFFB0 00000000 00000000 FF000000 0001107F + V000FFFC0 00000000 00000000 00000000 00000000 + V000FFFD0 00010400 80010802 8001085A 000FFFA0 + + +our 3rd return address is 8001085A + +as the 04B52002 looks suspiciously like rubbish it is fair to assume that the +kernel entry routines for the sake of optimisation don't set up a backchain. + +now look at System.map to see if the addresses make any sense:: + + grep -i 0001b3 System.map + +outputs among other things:: + + 0001b304 T cpu_idle + +so 8001B36A +is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it ) + +:: + + grep -i 00014 System.map + +produces among other things:: + + 00014a78 T start_kernel + +so 0014CA6 is start_kernel+some hex number I can't add in my head. + +:: + + grep -i 00108 System.map + +this produces:: + + 00010800 T _stext + +so 8001085A is _stext+0x5a + +Congrats you've done your first backchain. + + + +s/390 & z/Architecture IO Overview +================================== + +I am not going to give a course in 390 IO architecture as this would take me +quite a while and I'm no expert. Instead I'll give a 390 IO architecture +summary for Dummies. If you have the s/390 principles of operation available +read this instead. If nothing else you may find a few useful keywords in here +and be able to use them on a web search engine to find more useful information. + +Unlike other bus architectures modern 390 systems do their IO using mostly +fibre optics and devices such as tapes and disks can be shared between several +mainframes. Also S390 can support up to 65536 devices while a high end PC based +system might be choking with around 64. + +Here is some of the common IO terminology: + +Subchannel: + This is the logical number most IO commands use to talk to an IO device. There + can be up to 0x10000 (65536) of these in a configuration, typically there are a + few hundred. Under VM for simplicity they are allocated contiguously, however + on the native hardware they are not. They typically stay consistent between + boots provided no new hardware is inserted or removed. + + Under Linux for s390 we use these as IRQ's and also when issuing an IO command + (CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL, + START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID + of the device we wish to talk to. The most important of these instructions are + START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO + completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have + up to 8 channel paths to a device, this offers redundancy if one is not + available. + +Device Number: + This number remains static and is closely tied to the hardware. There are 65536 + of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and + another lsb 8 bits. These remain static even if more devices are inserted or + removed from the hardware. There is a 1 to 1 mapping between subchannels and + device numbers, provided devices aren't inserted or removed. + +Channel Control Words: + CCWs are linked lists of instructions initially pointed to by an operation + request block (ORB), which is initially given to Start Subchannel (SSCH) + command along with the subchannel number for the IO subsystem to process + while the CPU continues executing normal code. + CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and + Format 1 (31 bit). These are typically used to issue read and write (and many + other) instructions. They consist of a length field and an absolute address + field. + + Each IO typically gets 1 or 2 interrupts, one for channel end (primary status) + when the channel is idle, and the second for device end (secondary status). + Sometimes you get both concurrently. You check how the IO went on by issuing a + TEST SUBCHANNEL at each interrupt, from which you receive an Interruption + response block (IRB). If you get channel and device end status in the IRB + without channel checks etc. your IO probably went okay. If you didn't you + probably need to examine the IRB, extended status word etc. + If an error occurs, more sophisticated control units have a facility known as + concurrent sense. This means that if an error occurs Extended sense information + will be presented in the Extended status word in the IRB. If not you have to + issue a subsequent SENSE CCW command after the test subchannel. + + +TPI (Test pending interrupt) can also be used for polled IO, but in +multitasking multiprocessor systems it isn't recommended except for +checking special cases (i.e. non looping checks for pending IO etc.). + +Store Subchannel and Modify Subchannel can be used to examine and modify +operating characteristics of a subchannel (e.g. channel paths). + +Other IO related Terms: + +Sysplex: + S390's Clustering Technology +QDIO: + S390's new high speed IO architecture to support devices such as gigabit + ethernet, this architecture is also designed to be forward compatible with + upcoming 64 bit machines. + + +General Concepts +---------------- + +Input Output Processors (IOP's) are responsible for communicating between +the mainframe CPU's & the channel & relieve the mainframe CPU's from the +burden of communicating with IO devices directly, this allows the CPU's to +concentrate on data processing. + +IOP's can use one or more links ( known as channel paths ) to talk to each +IO device. It first checks for path availability & chooses an available one, +then starts ( & sometimes terminates IO ). +There are two types of channel path: ESCON & the Parallel IO interface. + +IO devices are attached to control units, control units provide the +logic to interface the channel paths & channel path IO protocols to +the IO devices, they can be integrated with the devices or housed separately +& often talk to several similar devices ( typical examples would be raid +controllers or a control unit which connects to 1000 3270 terminals ):: + + + +---------------------------------------------------------------+ + | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | + | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | | + | | | | | | | | | | Memory | | Storage | | + | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ | + |---------------------------------------------------------------+ + | IOP | IOP | IOP | + |--------------------------------------------------------------- + | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | + ---------------------------------------------------------------- + || || + || Bus & Tag Channel Path || ESCON + || ====================== || Channel + || || || || Path + +----------+ +----------+ +----------+ + | | | | | | + | CU | | CU | | CU | + | | | | | | + +----------+ +----------+ +----------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + |I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device| + +----------+ +----------+ +----------+ +----------+ +----------+ + CPU = Central Processing Unit + C = Channel + IOP = IP Processor + CU = Control Unit + +The 390 IO systems come in 2 flavours the current 390 machines support both + +The Older 360 & 370 Interface,sometimes called the Parallel I/O interface, +sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers +Interface (OEMI). + +This byte wide Parallel channel path/bus has parity & data on the "Bus" cable +and control lines on the "Tag" cable. These can operate in byte multiplex mode +for sharing between several slow devices or burst mode and monopolize the +channel for the whole burst. Up to 256 devices can be addressed on one of these +cables. These cables are about one inch in diameter. The maximum unextended +length supported by these cables is 125 Meters but this can be extended up to +2km with a fibre optic channel extended such as a 3044. The maximum burst speed +supported is 4.5 megabytes per second. However, some really old processors +support only transfer rates of 3.0, 2.0 & 1.0 MB/sec. +One of these paths can be daisy chained to up to 8 control units. + + +ESCON if fibre optic it is also called FICON +Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or +lasers for communication at a signaling rate of up to 200 megabits/sec. As +10bits are transferred for every 8 bits info this drops to 160 megabits/sec +and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only +operates in burst mode. + +ESCONs typical max cable length is 3km for the led version and 20km for the +laser version known as XDF (extended distance facility). This can be further +extended by using an ESCON director which triples the above mentioned ranges. +Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture, +the standard Bus & Tag control protocol is however present within the packets. +Up to 256 devices can be attached to each control unit that uses one of these +interfaces. + +Common 390 Devices include: +Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters, +Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console). +DASD's direct access storage devices ( otherwise known as hard disks ). +Tape Drives. +CTC ( Channel to Channel Adapters ), +ESCON or Parallel Cables used as a very high speed serial link +between 2 machines. + + +Debugging IO on s/390 & z/Architecture under VM +=============================================== + +Now we are ready to go on with IO tracing commands under VM + +A few self explanatory queries:: + + Q OSA + Q CTC + Q DISK ( This command is CMS specific ) + Q DASD + +Q OSA on my machine returns:: + + OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000 + OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001 + OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002 + OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003 + +If you have a guest with certain privileges you may be able to see devices +which don't belong to you. To avoid this, add the option V. +e.g.:: + + Q V OSA + +Now using the device numbers returned by this command we will +Trace the io starting up on the first device 7c08 & 7c09 +In our simplest case we can trace the +start subchannels +like TR SSCH 7C08-7C09 +or the halt subchannels +or TR HSCH 7C08-7C09 +MSCH's ,STSCH's I think you can guess the rest + +A good trick is tracing all the IO's and CCWS and spooling them into the reader +of another VM guest so he can ftp the logfile back to his own machine. I'll do +a small bit of this and give you a look at the output. + +1) Spool stdout to VM reader:: + + SP PRT TO (another vm guest ) or * for the local vm guest + +2) Fill the reader with the trace:: + + TR IO 7c08-7c09 INST INT CCW PRT RUN + +3) Start up linux:: + + i 00c +4) Finish the trace:: + + TR END + +5) close the reader:: + + C PRT + +6) list reader contents:: + + RDRLIST + +7) copy it to linux4's minidisk:: + + RECEIVE / LOG TXT A1 ( replace + +8) +filel & press F11 to look at it +You should see something like:: + + 00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08 + CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80 + CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........ + IDAL 43D8AFE8 + IDAL 0FB76000 + 00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4 + 00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08 + CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC + KEY 0 FPI C0 CC 0 CTLS 4007 + 00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08 + +If you don't like messing up your readed ( because you possibly booted from it ) +you can alternatively spool it to another readers guest. + + +Other common VM device related commands +--------------------------------------------- +These commands are listed only because they have +been of use to me in the past & may be of use to +you too. For more complete info on each of the commands +use type HELP from CMS. + +detaching devices:: + + DET + ATT + +attach a device to guest * for your own guest + +READY + cause VM to issue a fake interrupt. + +The VARY command is normally only available to VM administrators:: + + VARY ON PATH TO + VARY OFF PATH FROM + +This is used to switch on or off channel paths to devices. + +Q CHPID + This displays state of devices using this channel path + +D SCHIB + This displays the subchannel information SCHIB block for the device. + this I believe is also only available to administrators. + +DEFINE CTC + defines a virtual CTC channel to channel connection + 2 need to be defined on each guest for the CTC driver to use. + +COUPLE devno userid remote devno + Joins a local virtual device to a remote virtual device + ( commonly used for the CTC driver ). + +Building a VM ramdisk under CMS which linux can use:: + + def vfb- + +blocksize is commonly 4096 for linux. + +Formatting it:: + + format (blksize + +Sharing a disk between multiple guests:: + + LINK userid devno1 devno2 mode password + + + +GDB on S390 +=========== +N.B. if compiling for debugging gdb works better without optimisation +( see Compiling programs for debugging ) + +invocation +---------- +gdb + +Online help +----------- +help: gives help on commands + +e.g.:: + + help + help display + +Note gdb's online help is very good use it. + + +Assembly +-------- +info registers: + displays registers other than floating point. + +info all-registers: + displays floating points as well. + +disassemble: + disassembles + +e.g.:: + + disassemble without parameters will disassemble the current function + disassemble $pc $pc+10 + +Viewing & modifying variables +----------------------------- +print or p: + displays variable or register + +e.g. p/x $sp will display the stack pointer + +display: + prints variable or register each time program stops + +e.g.:: + + display/x $pc will display the program counter + display argc + +undisplay: + undo's display's + +info breakpoints: + shows all current breakpoints + +info stack: + shows stack back trace (if this doesn't work too well, I'll show + you the stacktrace by hand below). + +info locals: + displays local variables. + +info args: + display current procedure arguments. + +set args: + will set argc & argv each time the victim program is invoked + +e.g.:: + + set =value + set argc=100 + set $pc=0 + + + +Modifying execution +------------------- +step: + steps n lines of sourcecode + +step + steps 1 line. + +step 100 + steps 100 lines of code. + +next: + like step except this will not step into subroutines + +stepi: + steps a single machine code instruction. + +e.g.:: + + stepi 100 + +nexti: + steps a single machine code instruction but will not step into + subroutines. + +finish: + will run until exit of the current routine + +run: + (re)starts a program + +cont: + continues a program + +quit: + exits gdb. + + +breakpoints +------------ + +break + sets a breakpoint + +e.g.:: + + break main + break *$pc + break *0x400618 + +Here's a really useful one for large programs + +rbr + Set a breakpoint for all functions matching REGEXP + +e.g.:: + + rbr 390 + +will set a breakpoint with all functions with 390 in their name. + +info breakpoints + lists all breakpoints + +delete: + delete breakpoint by number or delete them all + +e.g. + +delete 1 + will delete the first breakpoint + + +delete + will delete them all + +watch: + This will set a watchpoint ( usually hardware assisted ), + +This will watch a variable till it changes + +e.g. + +watch cnt + will watch the variable cnt till it changes. + +As an aside unfortunately gdb's, architecture independent watchpoint code +is inconsistent & not very good, watchpoints usually work but not always. + +info watchpoints: + Display currently active watchpoints + +condition: ( another useful one ) + Specify breakpoint number N to break only if COND is true. + +Usage is `condition N COND`, where N is an integer and COND is an +expression to be evaluated whenever breakpoint N is reached. + + + +User defined functions/macros +----------------------------- +define: ( Note this is very very useful,simple & powerful ) + +usage define end + +examples which you should consider putting into .gdbinit in your home +directory:: + + define d + stepi + disassemble $pc $pc+10 + end + define e + nexti + disassemble $pc $pc+10 + end + + +Other hard to classify stuff +---------------------------- +signal n: + sends the victim program a signal. + +e.g. `signal 3` will send a SIGQUIT. + +info signals: + what gdb does when the victim receives certain signals. + +list: + +e.g.: + +list + lists current function source +list 1,10 + list first 10 lines of current file. + +list test.c:1,10 + + +directory: + Adds directories to be searched for source if gdb cannot find the source. + (note it is a bit sensitive about slashes) + +e.g. To add the root of the filesystem to the searchpath do:: + + directory // + + +call +This calls a function in the victim program, this is pretty powerful +e.g. +(gdb) call printf("hello world") +outputs: +$1 = 11 + +You might now be thinking that the line above didn't work, something extra had +to be done. +(gdb) call fflush(stdout) +hello world$2 = 0 +As an aside the debugger also calls malloc & free under the hood +to make space for the "hello world" string. + + + +hints +----- +1) command completion works just like bash + ( if you are a bad typist like me this really helps ) + +e.g. hit br & cursor up & down :-). + +2) if you have a debugging problem that takes a few steps to recreate +put the steps into a file called .gdbinit in your current working directory +if you have defined a few extra useful user defined commands put these in +your home directory & they will be read each time gdb is launched. + +A typical .gdbinit file might be.:: + + break main + run + break runtime_exception + cont + + +stack chaining in gdb by hand +----------------------------- +This is done using a the same trick described for VM:: + + p/x (*($sp+56))&0x7fffffff + +get the first backchain. + +For z/Architecture +Replace 56 with 112 & ignore the &0x7fffffff +in the macros below & do nasty casts to longs like the following +as gdb unfortunately deals with printed arguments as ints which +messes up everything. + +i.e. here is a 3rd backchain dereference:: + + p/x *(long *)(***(long ***)$sp+112) + + +this outputs:: + + $5 = 0x528f18 + +on my machine. + +Now you can use:: + + info symbol (*($sp+56))&0x7fffffff + +you might see something like:: + + rl_getc + 36 in section .text + +telling you what is located at address 0x528f18 +Now do:: + + p/x (*(*$sp+56))&0x7fffffff + +This outputs:: + + $6 = 0x528ed0 + +Now do:: + + info symbol (*(*$sp+56))&0x7fffffff + rl_read_key + 180 in section .text + +now do:: + + p/x (*(**$sp+56))&0x7fffffff + +& so on. + +Disassembling instructions without debug info +--------------------------------------------- +gdb typically complains if there is a lack of debugging +symbols in the disassemble command with +"No function contains specified address." To get around +this do:: + + x/xi
+ +e.g.:: + + x/20xi 0x400730 + + + +Note: + Remember gdb has history just like bash you don't need to retype the + whole line just use the up & down arrows. + + + +For more info +------------- +From your linuxbox do:: + + man gdb + +or:: + + info gdb. + +core dumps +---------- + +What a core dump ? +^^^^^^^^^^^^^^^^^^ + +A core dump is a file generated by the kernel (if allowed) which contains the +registers and all active pages of the program which has crashed. + +From this file gdb will allow you to look at the registers, stack trace and +memory of the program as if it just crashed on your system. It is usually +called core and created in the current working directory. + +This is very useful in that a customer can mail a core dump to a technical +support department and the technical support department can reconstruct what +happened. Provided they have an identical copy of this program with debugging +symbols compiled in and the source base of this build is available. + +In short it is far more useful than something like a crash log could ever hope +to be. + +Why have I never seen one ? +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Probably because you haven't used the command:: + + ulimit -c unlimited in bash + +to allow core dumps, now do:: + + ulimit -a + +to verify that the limit was accepted. + +A sample core dump + To create this I'm going to do:: + + ulimit -c unlimited + gdb + +to launch gdb (my victim app. ) now be bad & do the following from another +telnet/xterm session to the same machine:: + + ps -aux | grep gdb + kill -SIGSEGV + +or alternatively use `killall -SIGSEGV gdb` if you have the killall command. + +Now look at the core dump:: + + ./gdb core + +Displays the following:: + + GNU gdb 4.18 + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "s390-ibm-linux"... + Core was generated by `./gdb'. + Program terminated with signal 11, Segmentation fault. + Reading symbols from /usr/lib/libncurses.so.4...done. + Reading symbols from /lib/libm.so.6...done. + Reading symbols from /lib/libc.so.6...done. + Reading symbols from /lib/ld-linux.so.2...done. + #0 0x40126d1a in read () from /lib/libc.so.6 + Setting up the environment for debugging gdb. + Breakpoint 1 at 0x4dc6f8: file utils.c, line 471. + Breakpoint 2 at 0x4d87a4: file top.c, line 2609. + (top-gdb) info stack + #0 0x40126d1a in read () from /lib/libc.so.6 + #1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402 + #2 0x528ed0 in rl_read_key () at input.c:381 + #3 0x5167e6 in readline_internal_char () at readline.c:454 + #4 0x5168ee in readline_internal_charloop () at readline.c:507 + #5 0x51692c in readline_internal () at readline.c:521 + #6 0x5164fe in readline (prompt=0x7ffff810) + at readline.c:349 + #7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1, + annotation_suffix=0x4d6b44 "prompt") at top.c:2091 + #8 0x4d6cf0 in command_loop () at top.c:1345 + #9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 + + +LDD +=== +This is a program which lists the shared libraries which a library needs, +Note you also get the relocations of the shared library text segments which +help when using objdump --source. + +e.g.:: + + ldd ./gdb + +outputs:: + + libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000) + libm.so.6 => /lib/libm.so.6 (0x4005e000) + libc.so.6 => /lib/libc.so.6 (0x40084000) + /lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000) + + +Debugging shared libraries +========================== +Most programs use shared libraries, however it can be very painful +when you single step instruction into a function like printf for the +first time & you end up in functions like _dl_runtime_resolve this is +the ld.so doing lazy binding, lazy binding is a concept in ELF where +shared library functions are not loaded into memory unless they are +actually used, great for saving memory but a pain to debug. + +To get around this either relink the program -static or exit gdb type +export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing +the program in question. + + + +Debugging modules +================= +As modules are dynamically loaded into the kernel their address can be +anywhere to get around this use the -m option with insmod to emit a load +map which can be piped into a file if required. + +The proc file system +==================== +What is it ?. +It is a filesystem created by the kernel with files which are created on demand +by the kernel if read, or can be used to modify kernel parameters, +it is a powerful concept. + +e.g.:: + + cat /proc/sys/net/ipv4/ip_forward + +On my machine outputs:: + + 0 + +telling me ip_forwarding is not on to switch it on I can do:: + + echo 1 > /proc/sys/net/ipv4/ip_forward + +cat it again:: + + cat /proc/sys/net/ipv4/ip_forward + +On my machine now outputs:: + + 1 + +IP forwarding is on. + +There is a lot of useful info in here best found by going in and having a look +around, so I'll take you through some entries I consider important. + +All the processes running on the machine have their own entry defined by +/proc/ + +So lets have a look at the init process:: + + cd /proc/1 + cat cmdline + +emits:: + + init [2] + +:: + + cd /proc/1/fd + +This contains numerical entries of all the open files, +some of these you can cat e.g. stdout (2):: + + cat /proc/29/maps + +on my machine emits:: + + 00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash + 00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash + 0047e000-00492000 rwxp 00000000 00:00 0 + 40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so + 40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so + 40016000-40017000 rwxp 00000000 00:00 0 + 40017000-40018000 rw-p 00000000 00:00 0 + 40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8 + 4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8 + 4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so + 4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so + 40111000-40114000 rw-p 00000000 00:00 0 + 40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so + 4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so + 7fffd000-80000000 rwxp ffffe000 00:00 0 + + +Showing us the shared libraries init uses where they are in memory +& memory access permissions for each virtual memory area. + +/proc/1/cwd is a softlink to the current working directory. + +/proc/1/root is the root of the filesystem for this process. + +/proc/1/mem is the current running processes memory which you +can read & write to like a file. + +strace uses this sometimes as it is a bit faster than the +rather inefficient ptrace interface for peeking at DATA. + +:: + + cat status + + Name: init + State: S (sleeping) + Pid: 1 + PPid: 0 + Uid: 0 0 0 0 + Gid: 0 0 0 0 + Groups: + VmSize: 408 kB + VmLck: 0 kB + VmRSS: 208 kB + VmData: 24 kB + VmStk: 8 kB + VmExe: 368 kB + VmLib: 0 kB + SigPnd: 0000000000000000 + SigBlk: 0000000000000000 + SigIgn: 7fffffffd7f0d8fc + SigCgt: 00000000280b2603 + CapInh: 00000000fffffeff + CapPrm: 00000000ffffffff + CapEff: 00000000fffffeff + + User PSW: 070de000 80414146 + task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68 + User GPRS: + 00000400 00000000 0000000b 7ffffa90 + 00000000 00000000 00000000 0045d9f4 + 0045cafc 7ffffa90 7fffff18 0045cb08 + 00010400 804039e8 80403af8 7ffff8b0 + User ACRS: + 00000000 00000000 00000000 00000000 + 00000001 00000000 00000000 00000000 + 00000000 00000000 00000000 00000000 + 00000000 00000000 00000000 00000000 + Kernel BackChain CallChain BackChain CallChain + 004b7ca8 8002bd0c 004b7d18 8002b92c + 004b7db8 8005cd50 004b7e38 8005d12a + 004b7f08 80019114 + +Showing among other things memory usage & status of some signals & +the processes'es registers from the kernel task_structure +as well as a backchain which may be useful if a process crashes +in the kernel for some unknown reason. + +Some driver debugging techniques +================================ +debug feature +------------- +Some of our drivers now support a "debug feature" in +/proc/s390dbf see s390dbf.txt in the linux/Documentation directory +for more info. + +e.g. +to switch on the lcs "debug feature":: + + echo 5 > /proc/s390dbf/lcs/level + +& then after the error occurred:: + + cat /proc/s390dbf/lcs/sprintf >/logfile + +the logfile now contains some information which may help +tech support resolve a problem in the field. + + + +high level debugging network drivers +------------------------------------ +ifconfig is a quite useful command +it gives the current state of network drivers. + +If you suspect your network device driver is dead +one way to check is type:: + + ifconfig + +e.g. tr0 + +You should see something like:: + + ifconfig tr0 + tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48 + inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0 + UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1 + RX packets:246134 errors:0 dropped:0 overruns:0 frame:0 + TX packets:5 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:100 + +if the device doesn't say up +try:: + + /etc/rc.d/init.d/network start + +( this starts the network stack & hopefully calls ifconfig tr0 up ). +ifconfig looks at the output of /proc/net/dev and presents it in a more +presentable form. + +Now ping the device from a machine in the same subnet. + +if the RX packets count & TX packets counts don't increment you probably +have problems. + +next:: + + cat /proc/net/arp + +Do you see any hardware addresses in the cache if not you may have problems. +Next try:: + + ping -c 5 + +i.e. the Bcast field above in the output of +ifconfig. Do you see any replies from machines other than the local machine +if not you may have problems. also if the TX packets count in ifconfig +hasn't incremented either you have serious problems in your driver +(e.g. the txbusy field of the network device being stuck on ) +or you may have multiple network devices connected. + + +chandev +------- +There is a new device layer for channel devices, some +drivers e.g. lcs are registered with this layer. + +If the device uses the channel device layer you'll be +able to find what interrupts it uses & the current state +of the device. + +See the manpage chandev.8 &type cat /proc/chandev for more info. + + +SysRq +===== +This is now supported by linux for s/390 & z/Architecture. + +To enable it do compile the kernel with:: + + Kernel Hacking -> Magic SysRq Key Enabled + +Then:: + + echo "1" > /proc/sys/kernel/sysrq + +also type:: + + echo "8" >/proc/sys/kernel/printk + +To make printk output go to console. + +On 390 all commands are prefixed with:: + + ^- + +e.g.:: + + ^-t will show tasks. + ^-? or some unknown command will display help. + +The sysrq key reading is very picky ( I have to type the keys in an +xterm session & paste them into the x3270 console ) +& it may be wise to predefine the keys as described in the VM hints above + +This is particularly useful for syncing disks unmounting & rebooting +if the machine gets partially hung. + +Read Documentation/admin-guide/sysrq.rst for more info + +References: +=========== +- Enterprise Systems Architecture Reference Summary +- Enterprise Systems Architecture Principles of Operation +- Hartmut Penners s390 stack frame sheet. +- IBM Mainframe Channel Attachment a technology brief from a CISCO webpage +- Various bits of man & info pages of Linux. +- Linux & GDB source. +- Various info & man pages. +- CMS Help on tracing commands. +- Linux for s/390 Elf Application Binary Interface +- Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended ) +- z/Architecture Principles of Operation SA22-7832-00 +- Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the +- Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05 + +Special Thanks +============== +Special thanks to Neale Ferguson who maintains a much +prettier HTML version of this page at +http://linuxvm.org/penguinvm/ +Bob Grainger Stefan Bader & others for reporting bugs diff --git a/Documentation/s390/driver-model.rst b/Documentation/s390/driver-model.rst new file mode 100644 index 000000000000..ad4bc2dbea43 --- /dev/null +++ b/Documentation/s390/driver-model.rst @@ -0,0 +1,328 @@ +============================= +S/390 driver model interfaces +============================= + +1. CCW devices +-------------- + +All devices which can be addressed by means of ccws are called 'CCW devices' - +even if they aren't actually driven by ccws. + +All ccw devices are accessed via a subchannel, this is reflected in the +structures under devices/:: + + devices/ + - system/ + - css0/ + - 0.0.0000/0.0.0815/ + - 0.0.0001/0.0.4711/ + - 0.0.0002/ + - 0.1.0000/0.1.1234/ + ... + - defunct/ + +In this example, device 0815 is accessed via subchannel 0 in subchannel set 0, +device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O +subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1. + +The subchannel named 'defunct' does not represent any real subchannel on the +system; it is a pseudo subchannel where disconnected ccw devices are moved to +if they are displaced by another ccw device becoming operational on their +former subchannel. The ccw devices will be moved again to a proper subchannel +if they become operational again on that subchannel. + +You should address a ccw device via its bus id (e.g. 0.0.4711); the device can +be found under bus/ccw/devices/. + +All ccw devices export some data via sysfs. + +cutype: + The control unit type / model. + +devtype: + The device type / model, if applicable. + +availability: + Can be 'good' or 'boxed'; 'no path' or 'no device' for + disconnected devices. + +online: + An interface to set the device online and offline. + In the special case of the device being disconnected (see the + notify function under 1.2), piping 0 to online will forcibly delete + the device. + +The device drivers can add entries to export per-device data and interfaces. + +There is also some data exported on a per-subchannel basis (see under +bus/css/devices/): + +chpids: + Via which chpids the device is connected. + +pimpampom: + The path installed, path available and path operational masks. + +There also might be additional data, for example for block devices. + + +1.1 Bringing up a ccw device +---------------------------- + +This is done in several steps. + +a. Each driver can provide one or more parameter interfaces where parameters can + be specified. These interfaces are also in the driver's responsibility. +b. After a. has been performed, if necessary, the device is finally brought up + via the 'online' interface. + + +1.2 Writing a driver for ccw devices +------------------------------------ + +The basic struct ccw_device and struct ccw_driver data structures can be found +under include/asm/ccwdev.h:: + + struct ccw_device { + spinlock_t *ccwlock; + struct ccw_device_private *private; + struct ccw_device_id id; + + struct ccw_driver *drv; + struct device dev; + int online; + + void (*handler) (struct ccw_device *dev, unsigned long intparm, + struct irb *irb); + }; + + struct ccw_driver { + struct module *owner; + struct ccw_device_id *ids; + int (*probe) (struct ccw_device *); + int (*remove) (struct ccw_device *); + int (*set_online) (struct ccw_device *); + int (*set_offline) (struct ccw_device *); + int (*notify) (struct ccw_device *, int); + struct device_driver driver; + char *name; + }; + +The 'private' field contains data needed for internal i/o operation only, and +is not available to the device driver. + +Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models +and/or device types/models it is interested. This information can later be found +in the struct ccw_device_id fields:: + + struct ccw_device_id { + __u16 match_flags; + + __u16 cu_type; + __u16 dev_type; + __u8 cu_model; + __u8 dev_model; + + unsigned long driver_info; + }; + +The functions in ccw_driver should be used in the following way: + +probe: + This function is called by the device layer for each device the driver + is interested in. The driver should only allocate private structures + to put in dev->driver_data and create attributes (if needed). Also, + the interrupt handler (see below) should be set here. + +:: + + int (*probe) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be probed. + + +remove: + This function is called by the device layer upon removal of the driver, + the device or the module. The driver should perform cleanups here. + +:: + + int (*remove) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be removed. + + +set_online: + This function is called by the common I/O layer when the device is + activated via the 'online' attribute. The driver should finally + setup and activate the device here. + +:: + + int (*set_online) (struct ccw_device *); + +Parameters: + cdev + - the device to be activated. The common layer has + verified that the device is not already online. + + +set_offline: This function is called by the common I/O layer when the device is + de-activated via the 'online' attribute. The driver should shut + down the device, but not de-allocate its private data. + +:: + + int (*set_offline) (struct ccw_device *); + +Parameters: + cdev + - the device to be deactivated. The common layer has + verified that the device is online. + + +notify: + This function is called by the common I/O layer for some state changes + of the device. + + Signalled to the driver are: + + * In online state, device detached (CIO_GONE) or last path gone + (CIO_NO_PATH). The driver must return !0 to keep the device; for + return code 0, the device will be deleted as usual (also when no + notify function is registered). If the driver wants to keep the + device, it is moved into disconnected state. + * In disconnected state, device operational again (CIO_OPER). The + common I/O layer performs some sanity checks on device number and + Device / CU to be reasonably sure if it is still the same device. + If not, the old device is removed and a new one registered. By the + return code of the notify function the device driver signals if it + wants the device back: !0 for keeping, 0 to make the device being + removed and re-registered. + +:: + + int (*notify) (struct ccw_device *, int); + +Parameters: + cdev + - the device whose state changed. + + event + - the event that happened. This can be one of CIO_GONE, + CIO_NO_PATH or CIO_OPER. + +The handler field of the struct ccw_device is meant to be set to the interrupt +handler for the device. In order to accommodate drivers which use several +distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device +instead of ccw_driver. +The handler is registered with the common layer during set_online() processing +before the driver is called, and is deregistered during set_offline() after the +driver has been called. Also, after registering / before deregistering, path +grouping resp. disbanding of the path group (if applicable) are performed. + +:: + + void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); + +Parameters: dev - the device the handler is called for + intparm - the intparm which allows the device driver to identify + the i/o the interrupt is associated with, or to recognize + the interrupt as unsolicited. + irb - interruption response block which contains the accumulated + status. + +The device driver is called from the common ccw_device layer and can retrieve +information about the interrupt from the irb parameter. + + +1.3 ccwgroup devices +-------------------- + +The ccwgroup mechanism is designed to handle devices consisting of multiple ccw +devices, like lcs or ctc. + +The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to +this attributes creates a ccwgroup device consisting of these ccw devices (if +possible). This ccwgroup device can be set online or offline just like a normal +ccw device. + +Each ccwgroup device also provides an 'ungroup' attribute to destroy the device +again (only when offline). This is a generic ccwgroup mechanism (the driver does +not need to implement anything beyond normal removal routines). + +A ccw device which is a member of a ccwgroup device carries a pointer to the +ccwgroup device in the driver_data of its device struct. This field must not be +touched by the driver - it should use the ccwgroup device's driver_data for its +private data. + +To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in +mind that most drivers will need to implement both a ccwgroup and a ccw +driver. + + +2. Channel paths +----------------- + +Channel paths show up, like subchannels, under the channel subsystem root (css0) +and are called 'chp0.'. They have no driver and do not belong to any bus. +Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect +only the logical state and not the physical state, since we cannot track the +latter consistently due to lacking machine support (we don't need to be aware +of it anyway). + +status + - Can be 'online' or 'offline'. + Piping 'on' or 'off' sets the chpid logically online/offline. + Piping 'on' to an online chpid triggers path reprobing for all devices + the chpid connects to. This can be used to force the kernel to re-use + a channel path the user knows to be online, but the machine hasn't + created a machine check for. + +type + - The physical type of the channel path. + +shared + - Whether the channel path is shared. + +cmg + - The channel measurement group. + +3. System devices +----------------- + +3.1 xpram +--------- + +xpram shows up under devices/system/ as 'xpram'. + +3.2 cpus +-------- + +For each cpu, a directory is created under devices/system/cpu/. Each cpu has an +attribute 'online' which can be 0 or 1. + + +4. Other devices +---------------- + +4.1 Netiucv +----------- + +The netiucv driver creates an attribute 'connection' under +bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv +connection to the specified host. + +Netiucv connections show up under devices/iucv/ as "netiucv". The interface +number is assigned sequentially to the connections defined via the 'connection' +attribute. + +user + - shows the connection partner. + +buffer + - maximum buffer size. Pipe to it to change buffer size. diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt deleted file mode 100644 index ed265cf54cde..000000000000 --- a/Documentation/s390/driver-model.txt +++ /dev/null @@ -1,287 +0,0 @@ -S/390 driver model interfaces ------------------------------ - -1. CCW devices --------------- - -All devices which can be addressed by means of ccws are called 'CCW devices' - -even if they aren't actually driven by ccws. - -All ccw devices are accessed via a subchannel, this is reflected in the -structures under devices/: - -devices/ - - system/ - - css0/ - - 0.0.0000/0.0.0815/ - - 0.0.0001/0.0.4711/ - - 0.0.0002/ - - 0.1.0000/0.1.1234/ - ... - - defunct/ - -In this example, device 0815 is accessed via subchannel 0 in subchannel set 0, -device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O -subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1. - -The subchannel named 'defunct' does not represent any real subchannel on the -system; it is a pseudo subchannel where disconnected ccw devices are moved to -if they are displaced by another ccw device becoming operational on their -former subchannel. The ccw devices will be moved again to a proper subchannel -if they become operational again on that subchannel. - -You should address a ccw device via its bus id (e.g. 0.0.4711); the device can -be found under bus/ccw/devices/. - -All ccw devices export some data via sysfs. - -cutype: The control unit type / model. - -devtype: The device type / model, if applicable. - -availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for - disconnected devices. - -online: An interface to set the device online and offline. - In the special case of the device being disconnected (see the - notify function under 1.2), piping 0 to online will forcibly delete - the device. - -The device drivers can add entries to export per-device data and interfaces. - -There is also some data exported on a per-subchannel basis (see under -bus/css/devices/): - -chpids: Via which chpids the device is connected. - -pimpampom: The path installed, path available and path operational masks. - -There also might be additional data, for example for block devices. - - -1.1 Bringing up a ccw device ----------------------------- - -This is done in several steps. - -a. Each driver can provide one or more parameter interfaces where parameters can - be specified. These interfaces are also in the driver's responsibility. -b. After a. has been performed, if necessary, the device is finally brought up - via the 'online' interface. - - -1.2 Writing a driver for ccw devices ------------------------------------- - -The basic struct ccw_device and struct ccw_driver data structures can be found -under include/asm/ccwdev.h. - -struct ccw_device { - spinlock_t *ccwlock; - struct ccw_device_private *private; - struct ccw_device_id id; - - struct ccw_driver *drv; - struct device dev; - int online; - - void (*handler) (struct ccw_device *dev, unsigned long intparm, - struct irb *irb); -}; - -struct ccw_driver { - struct module *owner; - struct ccw_device_id *ids; - int (*probe) (struct ccw_device *); - int (*remove) (struct ccw_device *); - int (*set_online) (struct ccw_device *); - int (*set_offline) (struct ccw_device *); - int (*notify) (struct ccw_device *, int); - struct device_driver driver; - char *name; -}; - -The 'private' field contains data needed for internal i/o operation only, and -is not available to the device driver. - -Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models -and/or device types/models it is interested. This information can later be found -in the struct ccw_device_id fields: - -struct ccw_device_id { - __u16 match_flags; - - __u16 cu_type; - __u16 dev_type; - __u8 cu_model; - __u8 dev_model; - - unsigned long driver_info; -}; - -The functions in ccw_driver should be used in the following way: -probe: This function is called by the device layer for each device the driver - is interested in. The driver should only allocate private structures - to put in dev->driver_data and create attributes (if needed). Also, - the interrupt handler (see below) should be set here. - -int (*probe) (struct ccw_device *cdev); - -Parameters: cdev - the device to be probed. - - -remove: This function is called by the device layer upon removal of the driver, - the device or the module. The driver should perform cleanups here. - -int (*remove) (struct ccw_device *cdev); - -Parameters: cdev - the device to be removed. - - -set_online: This function is called by the common I/O layer when the device is - activated via the 'online' attribute. The driver should finally - setup and activate the device here. - -int (*set_online) (struct ccw_device *); - -Parameters: cdev - the device to be activated. The common layer has - verified that the device is not already online. - - -set_offline: This function is called by the common I/O layer when the device is - de-activated via the 'online' attribute. The driver should shut - down the device, but not de-allocate its private data. - -int (*set_offline) (struct ccw_device *); - -Parameters: cdev - the device to be deactivated. The common layer has - verified that the device is online. - - -notify: This function is called by the common I/O layer for some state changes - of the device. - Signalled to the driver are: - * In online state, device detached (CIO_GONE) or last path gone - (CIO_NO_PATH). The driver must return !0 to keep the device; for - return code 0, the device will be deleted as usual (also when no - notify function is registered). If the driver wants to keep the - device, it is moved into disconnected state. - * In disconnected state, device operational again (CIO_OPER). The - common I/O layer performs some sanity checks on device number and - Device / CU to be reasonably sure if it is still the same device. - If not, the old device is removed and a new one registered. By the - return code of the notify function the device driver signals if it - wants the device back: !0 for keeping, 0 to make the device being - removed and re-registered. - -int (*notify) (struct ccw_device *, int); - -Parameters: cdev - the device whose state changed. - event - the event that happened. This can be one of CIO_GONE, - CIO_NO_PATH or CIO_OPER. - -The handler field of the struct ccw_device is meant to be set to the interrupt -handler for the device. In order to accommodate drivers which use several -distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device -instead of ccw_driver. -The handler is registered with the common layer during set_online() processing -before the driver is called, and is deregistered during set_offline() after the -driver has been called. Also, after registering / before deregistering, path -grouping resp. disbanding of the path group (if applicable) are performed. - -void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); - -Parameters: dev - the device the handler is called for - intparm - the intparm which allows the device driver to identify - the i/o the interrupt is associated with, or to recognize - the interrupt as unsolicited. - irb - interruption response block which contains the accumulated - status. - -The device driver is called from the common ccw_device layer and can retrieve -information about the interrupt from the irb parameter. - - -1.3 ccwgroup devices --------------------- - -The ccwgroup mechanism is designed to handle devices consisting of multiple ccw -devices, like lcs or ctc. - -The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to -this attributes creates a ccwgroup device consisting of these ccw devices (if -possible). This ccwgroup device can be set online or offline just like a normal -ccw device. - -Each ccwgroup device also provides an 'ungroup' attribute to destroy the device -again (only when offline). This is a generic ccwgroup mechanism (the driver does -not need to implement anything beyond normal removal routines). - -A ccw device which is a member of a ccwgroup device carries a pointer to the -ccwgroup device in the driver_data of its device struct. This field must not be -touched by the driver - it should use the ccwgroup device's driver_data for its -private data. - -To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in -mind that most drivers will need to implement both a ccwgroup and a ccw -driver. - - -2. Channel paths ------------------ - -Channel paths show up, like subchannels, under the channel subsystem root (css0) -and are called 'chp0.'. They have no driver and do not belong to any bus. -Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect -only the logical state and not the physical state, since we cannot track the -latter consistently due to lacking machine support (we don't need to be aware -of it anyway). - -status - Can be 'online' or 'offline'. - Piping 'on' or 'off' sets the chpid logically online/offline. - Piping 'on' to an online chpid triggers path reprobing for all devices - the chpid connects to. This can be used to force the kernel to re-use - a channel path the user knows to be online, but the machine hasn't - created a machine check for. - -type - The physical type of the channel path. - -shared - Whether the channel path is shared. - -cmg - The channel measurement group. - -3. System devices ------------------ - -3.1 xpram ---------- - -xpram shows up under devices/system/ as 'xpram'. - -3.2 cpus --------- - -For each cpu, a directory is created under devices/system/cpu/. Each cpu has an -attribute 'online' which can be 0 or 1. - - -4. Other devices ----------------- - -4.1 Netiucv ------------ - -The netiucv driver creates an attribute 'connection' under -bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv -connection to the specified host. - -Netiucv connections show up under devices/iucv/ as "netiucv". The interface -number is assigned sequentially to the connections defined via the 'connection' -attribute. - -user - shows the connection partner. - -buffer - maximum buffer size. - Pipe to it to change buffer size. - - diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst new file mode 100644 index 000000000000..1a914da2a07b --- /dev/null +++ b/Documentation/s390/index.rst @@ -0,0 +1,30 @@ +:orphan: + +================= +s390 Architecture +================= + +.. toctree:: + :maxdepth: 1 + + cds + 3270 + debugging390 + driver-model + monreader + qeth + s390dbf + vfio-ap + vfio-ccw + zfcpdump + dasd + common_io + + text_files + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/s390/monreader.rst b/Documentation/s390/monreader.rst new file mode 100644 index 000000000000..1e857575c113 --- /dev/null +++ b/Documentation/s390/monreader.rst @@ -0,0 +1,212 @@ +================================================= +Linux API for read access to z/VM Monitor Records +================================================= + +Date : 2004-Nov-26 + +Author: Gerald Schaefer (geraldsc@de.ibm.com) + + + + +Description +=========== +This item delivers a new Linux API in the form of a misc char device that is +usable from user space and allows read access to the z/VM Monitor Records +collected by the `*MONITOR` System Service of z/VM. + + +User Requirements +================= +The z/VM guest on which you want to access this API needs to be configured in +order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the +IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is +restricted (likely), you also need the NAMESAVE statement. +This item will use the IUCV device driver to access the z/VM services, so you +need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1. + +There are two options for being able to load the monitor DCSS (examples assume +that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the +location of the monitor DCSS with the Class E privileged CP command Q NSS MAP +(the values BEGPAG and ENDPAG are given in units of 4K pages). + +See also "CP Command and Utility Reference" (SC24-6081-00) for more information +on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning +and Administration" (SC24-6116-00) for more information on DCSSes. + +1st option: +----------- +You can use the CP command DEF STOR CONFIG to define a "memory hole" in your +guest virtual storage around the address range of the DCSS. + +Example: DEF STOR CONFIG 0.140M 200M.200M + +This defines two blocks of storage, the first is 140MB in size an begins at +address 0MB, the second is 200MB in size and begins at address 200MB, +resulting in a total storage of 340MB. Note that the first block should +always start at 0 and be at least 64MB in size. + +2nd option: +----------- +Your guest virtual storage has to end below the starting address of the DCSS +and you have to specify the "mem=" kernel parameter in your parmfile with a +value greater than the ending address of the DCSS. + +Example:: + + DEF STOR 140M + +This defines 140MB storage size for your guest, the parameter "mem=160M" is +added to the parmfile. + + +User Interface +============== +The char device is implemented as a kernel module named "monreader", +which can be loaded via the modprobe command, or it can be compiled into the +kernel instead. There is one optional module (or kernel) parameter, "mondcss", +to specify the name of the monitor DCSS. If the module is compiled into the +kernel, the kernel parameter "monreader.mondcss=" can be specified +in the parmfile. + +The default name for the DCSS is "MONDCSS" if none is specified. In case that +there are other users already connected to the `*MONITOR` service (e.g. +Performance Toolkit), the monitor DCSS is already defined and you have to use +the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name +of the monitor DCSS, if already defined, and the users connected to the +`*MONITOR` service. +Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor +DCSS if your z/VM doesn't have one already, you need Class E privileges to +define and save a DCSS. + +Example: +-------- + +:: + + modprobe monreader mondcss=MYDCSS + +This loads the module and sets the DCSS name to "MYDCSS". + +NOTE: +----- +This API provides no interface to control the `*MONITOR` service, e.g. specify +which data should be collected. This can be done by the CP command MONITOR +(Class E privileged), see "CP Command and Utility Reference". + +Device nodes with udev: +----------------------- +After loading the module, a char device will be created along with the device +node //monreader. + +Device nodes without udev: +-------------------------- +If your distribution does not support udev, a device node will not be created +automatically and you have to create it manually after loading the module. +Therefore you need to know the major and minor numbers of the device. These +numbers can be found in /sys/class/misc/monreader/dev. + +Typing cat /sys/class/misc/monreader/dev will give an output of the form +:. The device node can be created via the mknod command, enter +mknod c , where is the name of the device node +to be created. + +Example: +-------- + +:: + + # modprobe monreader + # cat /sys/class/misc/monreader/dev + 10:63 + # mknod /dev/monreader c 10 63 + +This loads the module with the default monitor DCSS (MONDCSS) and creates a +device node. + +File operations: +---------------- +The following file operations are supported: open, release, read, poll. +There are two alternative methods for reading: either non-blocking read in +conjunction with polling, or blocking read without polling. IOCTLs are not +supported. + +Read: +----- +Reading from the device provides a 12 Byte monitor control element (MCE), +followed by a set of one or more contiguous monitor records (similar to the +output of the CMS utility MONWRITE without the 4K control blocks). The MCE +contains information on the type of the following record set (sample/event +data), the monitor domains contained within it and the start and end address +of the record set in the monitor DCSS. The start and end address can be used +to determine the size of the record set, the end address is the address of the +last byte of data. The start address is needed to handle "end-of-frame" records +correctly (domain 1, record 13), i.e. it can be used to determine the record +start offset relative to a 4K page (frame) boundary. + +See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description +of the monitor control element layout. The layout of the monitor records can +be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html + +The layout of the data stream provided by the monreader device is as follows:: + + ... + <0 byte read> + \ + | + ... |- data set + | + / + <0 byte read> + ... + +There may be more than one combination of MCE and corresponding record set +within one data set and the end of each data set is indicated by a successful +read with a return value of 0 (0 byte read). +Any received data must be considered invalid until a complete set was +read successfully, including the closing 0 byte read. Therefore you should +always read the complete set into a buffer before processing the data. + +The maximum size of a data set can be as large as the size of the +monitor DCSS, so design the buffer adequately or use dynamic memory allocation. +The size of the monitor DCSS will be printed into syslog after loading the +module. You can also use the (Class E privileged) CP command Q NSS MAP to +list all available segments and information about them. + +As with most char devices, error conditions are indicated by returning a +negative value for the number of bytes read. In this case, the errno variable +indicates the error condition: + +EIO: + reply failed, read data is invalid and the application + should discard the data read since the last successful read with 0 size. +EFAULT: + copy_to_user failed, read data is invalid and the application should + discard the data read since the last successful read with 0 size. +EAGAIN: + occurs on a non-blocking read if there is no data available at the + moment. There is no data missing or corrupted, just try again or rather + use polling for non-blocking reads. +EOVERFLOW: + message limit reached, the data read since the last successful + read with 0 size is valid but subsequent records may be missing. + +In the last case (EOVERFLOW) there may be missing data, in the first two cases +(EIO, EFAULT) there will be missing data. It's up to the application if it will +continue reading subsequent data or rather exit. + +Open: +----- +Only one user is allowed to open the char device. If it is already in use, the +open function will fail (return a negative value) and set errno to EBUSY. +The open function may also fail if an IUCV connection to the `*MONITOR` service +cannot be established. In this case errno will be set to EIO and an error +message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER +codes are described in the "z/VM Performance" book, Appendix A. + +NOTE: +----- +As soon as the device is opened, incoming messages will be accepted and they +will account for the message limit, i.e. opening the device without reading +from it will provoke the "message limit reached" error (EOVERFLOW error code) +eventually. diff --git a/Documentation/s390/monreader.txt b/Documentation/s390/monreader.txt deleted file mode 100644 index d3729585fdb0..000000000000 --- a/Documentation/s390/monreader.txt +++ /dev/null @@ -1,197 +0,0 @@ - -Date : 2004-Nov-26 -Author: Gerald Schaefer (geraldsc@de.ibm.com) - - - Linux API for read access to z/VM Monitor Records - ================================================= - - -Description -=========== -This item delivers a new Linux API in the form of a misc char device that is -usable from user space and allows read access to the z/VM Monitor Records -collected by the *MONITOR System Service of z/VM. - - -User Requirements -================= -The z/VM guest on which you want to access this API needs to be configured in -order to allow IUCV connections to the *MONITOR service, i.e. it needs the -IUCV *MONITOR statement in its user entry. If the monitor DCSS to be used is -restricted (likely), you also need the NAMESAVE statement. -This item will use the IUCV device driver to access the z/VM services, so you -need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1. - -There are two options for being able to load the monitor DCSS (examples assume -that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the -location of the monitor DCSS with the Class E privileged CP command Q NSS MAP -(the values BEGPAG and ENDPAG are given in units of 4K pages). - -See also "CP Command and Utility Reference" (SC24-6081-00) for more information -on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning -and Administration" (SC24-6116-00) for more information on DCSSes. - -1st option: ------------ -You can use the CP command DEF STOR CONFIG to define a "memory hole" in your -guest virtual storage around the address range of the DCSS. - -Example: DEF STOR CONFIG 0.140M 200M.200M - -This defines two blocks of storage, the first is 140MB in size an begins at -address 0MB, the second is 200MB in size and begins at address 200MB, -resulting in a total storage of 340MB. Note that the first block should -always start at 0 and be at least 64MB in size. - -2nd option: ------------ -Your guest virtual storage has to end below the starting address of the DCSS -and you have to specify the "mem=" kernel parameter in your parmfile with a -value greater than the ending address of the DCSS. - -Example: DEF STOR 140M - -This defines 140MB storage size for your guest, the parameter "mem=160M" is -added to the parmfile. - - -User Interface -============== -The char device is implemented as a kernel module named "monreader", -which can be loaded via the modprobe command, or it can be compiled into the -kernel instead. There is one optional module (or kernel) parameter, "mondcss", -to specify the name of the monitor DCSS. If the module is compiled into the -kernel, the kernel parameter "monreader.mondcss=" can be specified -in the parmfile. - -The default name for the DCSS is "MONDCSS" if none is specified. In case that -there are other users already connected to the *MONITOR service (e.g. -Performance Toolkit), the monitor DCSS is already defined and you have to use -the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name -of the monitor DCSS, if already defined, and the users connected to the -*MONITOR service. -Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor -DCSS if your z/VM doesn't have one already, you need Class E privileges to -define and save a DCSS. - -Example: --------- -modprobe monreader mondcss=MYDCSS - -This loads the module and sets the DCSS name to "MYDCSS". - -NOTE: ------ -This API provides no interface to control the *MONITOR service, e.g. specify -which data should be collected. This can be done by the CP command MONITOR -(Class E privileged), see "CP Command and Utility Reference". - -Device nodes with udev: ------------------------ -After loading the module, a char device will be created along with the device -node //monreader. - -Device nodes without udev: --------------------------- -If your distribution does not support udev, a device node will not be created -automatically and you have to create it manually after loading the module. -Therefore you need to know the major and minor numbers of the device. These -numbers can be found in /sys/class/misc/monreader/dev. -Typing cat /sys/class/misc/monreader/dev will give an output of the form -:. The device node can be created via the mknod command, enter -mknod c , where is the name of the device node -to be created. - -Example: --------- -# modprobe monreader -# cat /sys/class/misc/monreader/dev -10:63 -# mknod /dev/monreader c 10 63 - -This loads the module with the default monitor DCSS (MONDCSS) and creates a -device node. - -File operations: ----------------- -The following file operations are supported: open, release, read, poll. -There are two alternative methods for reading: either non-blocking read in -conjunction with polling, or blocking read without polling. IOCTLs are not -supported. - -Read: ------ -Reading from the device provides a 12 Byte monitor control element (MCE), -followed by a set of one or more contiguous monitor records (similar to the -output of the CMS utility MONWRITE without the 4K control blocks). The MCE -contains information on the type of the following record set (sample/event -data), the monitor domains contained within it and the start and end address -of the record set in the monitor DCSS. The start and end address can be used -to determine the size of the record set, the end address is the address of the -last byte of data. The start address is needed to handle "end-of-frame" records -correctly (domain 1, record 13), i.e. it can be used to determine the record -start offset relative to a 4K page (frame) boundary. - -See "Appendix A: *MONITOR" in the "z/VM Performance" document for a description -of the monitor control element layout. The layout of the monitor records can -be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html - -The layout of the data stream provided by the monreader device is as follows: -... -<0 byte read> - \ - | -... |- data set - | - / -<0 byte read> -... - -There may be more than one combination of MCE and corresponding record set -within one data set and the end of each data set is indicated by a successful -read with a return value of 0 (0 byte read). -Any received data must be considered invalid until a complete set was -read successfully, including the closing 0 byte read. Therefore you should -always read the complete set into a buffer before processing the data. - -The maximum size of a data set can be as large as the size of the -monitor DCSS, so design the buffer adequately or use dynamic memory allocation. -The size of the monitor DCSS will be printed into syslog after loading the -module. You can also use the (Class E privileged) CP command Q NSS MAP to -list all available segments and information about them. - -As with most char devices, error conditions are indicated by returning a -negative value for the number of bytes read. In this case, the errno variable -indicates the error condition: - -EIO: reply failed, read data is invalid and the application - should discard the data read since the last successful read with 0 size. -EFAULT: copy_to_user failed, read data is invalid and the application should - discard the data read since the last successful read with 0 size. -EAGAIN: occurs on a non-blocking read if there is no data available at the - moment. There is no data missing or corrupted, just try again or rather - use polling for non-blocking reads. -EOVERFLOW: message limit reached, the data read since the last successful - read with 0 size is valid but subsequent records may be missing. - -In the last case (EOVERFLOW) there may be missing data, in the first two cases -(EIO, EFAULT) there will be missing data. It's up to the application if it will -continue reading subsequent data or rather exit. - -Open: ------ -Only one user is allowed to open the char device. If it is already in use, the -open function will fail (return a negative value) and set errno to EBUSY. -The open function may also fail if an IUCV connection to the *MONITOR service -cannot be established. In this case errno will be set to EIO and an error -message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER -codes are described in the "z/VM Performance" book, Appendix A. - -NOTE: ------ -As soon as the device is opened, incoming messages will be accepted and they -will account for the message limit, i.e. opening the device without reading -from it will provoke the "message limit reached" error (EOVERFLOW error code) -eventually. - diff --git a/Documentation/s390/qeth.rst b/Documentation/s390/qeth.rst new file mode 100644 index 000000000000..f02fdaa68de0 --- /dev/null +++ b/Documentation/s390/qeth.rst @@ -0,0 +1,64 @@ +============================= +IBM s390 QDIO Ethernet Driver +============================= + +OSA and HiperSockets Bridge Port Support +======================================== + +Uevents +------- + +To generate the events the device must be assigned a role of either +a primary or a secondary Bridge Port. For more information, see +"z/VM Connectivity, SC24-6174". + +When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state +of some configured Bridge Port device on the channel changes, a udev +event with ACTION=CHANGE is emitted on behalf of the corresponding +ccwgroup device. The event has the following attributes: + +BRIDGEPORT=statechange + indicates that the Bridge Port device changed + its state. + +ROLE={primary|secondary|none} + the role assigned to the port. + +STATE={active|standby|inactive} + the newly assumed state of the port. + +When run on HiperSockets Bridge Capable Port hardware with host address +notifications enabled, a udev event with ACTION=CHANGE is emitted. +It is emitted on behalf of the corresponding ccwgroup device when a host +or a VLAN is registered or unregistered on the network served by the device. +The event has the following attributes: + +BRIDGEDHOST={reset|register|deregister|abort} + host address + notifications are started afresh, a new host or VLAN is registered or + deregistered on the Bridge Port HiperSockets channel, or address + notifications are aborted. + +VLAN=numeric-vlan-id + VLAN ID on which the event occurred. Not included + if no VLAN is involved in the event. + +MAC=xx:xx:xx:xx:xx:xx + MAC address of the host that is being registered + or deregistered from the HiperSockets channel. Not reported if the + event reports the creation or destruction of a VLAN. + +NTOK_BUSID=x.y.zzzz + device bus ID (CSSID, SSID and device number). + +NTOK_IID=xx + device IID. + +NTOK_CHPID=xx + device CHPID. + +NTOK_CHID=xxxx + device channel ID. + +Note that the `NTOK_*` attributes refer to devices other than the one +connected to the system on which the OS is running. diff --git a/Documentation/s390/qeth.txt b/Documentation/s390/qeth.txt deleted file mode 100644 index aa06fcf5f8c2..000000000000 --- a/Documentation/s390/qeth.txt +++ /dev/null @@ -1,50 +0,0 @@ -IBM s390 QDIO Ethernet Driver - -OSA and HiperSockets Bridge Port Support - -Uevents - -To generate the events the device must be assigned a role of either -a primary or a secondary Bridge Port. For more information, see -"z/VM Connectivity, SC24-6174". - -When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state -of some configured Bridge Port device on the channel changes, a udev -event with ACTION=CHANGE is emitted on behalf of the corresponding -ccwgroup device. The event has the following attributes: - -BRIDGEPORT=statechange - indicates that the Bridge Port device changed - its state. - -ROLE={primary|secondary|none} - the role assigned to the port. - -STATE={active|standby|inactive} - the newly assumed state of the port. - -When run on HiperSockets Bridge Capable Port hardware with host address -notifications enabled, a udev event with ACTION=CHANGE is emitted. -It is emitted on behalf of the corresponding ccwgroup device when a host -or a VLAN is registered or unregistered on the network served by the device. -The event has the following attributes: - -BRIDGEDHOST={reset|register|deregister|abort} - host address - notifications are started afresh, a new host or VLAN is registered or - deregistered on the Bridge Port HiperSockets channel, or address - notifications are aborted. - -VLAN=numeric-vlan-id - VLAN ID on which the event occurred. Not included - if no VLAN is involved in the event. - -MAC=xx:xx:xx:xx:xx:xx - MAC address of the host that is being registered - or deregistered from the HiperSockets channel. Not reported if the - event reports the creation or destruction of a VLAN. - -NTOK_BUSID=x.y.zzzz - device bus ID (CSSID, SSID and device number). - -NTOK_IID=xx - device IID. - -NTOK_CHPID=xx - device CHPID. - -NTOK_CHID=xxxx - device channel ID. - -Note that the NTOK_* attributes refer to devices other than the one -connected to the system on which the OS is running. diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst new file mode 100644 index 000000000000..ec2a1faa414b --- /dev/null +++ b/Documentation/s390/s390dbf.rst @@ -0,0 +1,803 @@ +================== +S390 Debug Feature +================== + +files: + - arch/s390/kernel/debug.c + - arch/s390/include/asm/debug.h + +Description: +------------ +The goal of this feature is to provide a kernel debug logging API +where log records can be stored efficiently in memory, where each component +(e.g. device drivers) can have one separate debug log. +One purpose of this is to inspect the debug logs after a production system crash +in order to analyze the reason for the crash. + +If the system still runs but only a subcomponent which uses dbf fails, +it is possible to look at the debug logs on a live system via the Linux +debugfs filesystem. + +The debug feature may also very useful for kernel and driver development. + +Design: +------- +Kernel components (e.g. device drivers) can register themselves at the debug +feature with the function call debug_register(). This function initializes a +debug log for the caller. For each debug log exists a number of debug areas +where exactly one is active at one time. Each debug area consists of contiguous +pages in memory. In the debug areas there are stored debug entries (log records) +which are written by event- and exception-calls. + +An event-call writes the specified debug entry to the active debug +area and updates the log pointer for the active area. If the end +of the active debug area is reached, a wrap around is done (ring buffer) +and the next debug entry will be written at the beginning of the active +debug area. + +An exception-call writes the specified debug entry to the log and +switches to the next debug area. This is done in order to be sure +that the records which describe the origin of the exception are not +overwritten when a wrap around for the current area occurs. + +The debug areas themselves are also ordered in form of a ring buffer. +When an exception is thrown in the last debug area, the following debug +entries are then written again in the very first area. + +There are three versions for the event- and exception-calls: One for +logging raw data, one for text and one for numbers. + +Each debug entry contains the following data: + +- Timestamp +- Cpu-Number of calling task +- Level of debug entry (0...6) +- Return Address to caller +- Flag, if entry is an exception or not + +The debug logs can be inspected in a live system through entries in +the debugfs-filesystem. Under the toplevel directory "s390dbf" there is +a directory for each registered component, which is named like the +corresponding component. The debugfs normally should be mounted to +/sys/kernel/debug therefore the debug feature can be accessed under +/sys/kernel/debug/s390dbf. + +The content of the directories are files which represent different views +to the debug log. Each component can decide which views should be +used through registering them with the function debug_register_view(). +Predefined views for hex/ascii, sprintf and raw binary data are provided. +It is also possible to define other views. The content of +a view can be inspected simply by reading the corresponding debugfs file. + +All debug logs have an actual debug level (range from 0 to 6). +The default level is 3. Event and Exception functions have a 'level' +parameter. Only debug entries with a level that is lower or equal +than the actual level are written to the log. This means, when +writing events, high priority log entries should have a low level +value whereas low priority entries should have a high one. +The actual debug level can be changed with the help of the debugfs-filesystem +through writing a number string "x" to the 'level' debugfs file which is +provided for every debug log. Debugging can be switched off completely +by using "-" on the 'level' debugfs file. + +Example:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/level + +It is also possible to deactivate the debug feature globally for every +debug log. You can change the behavior using 2 sysctl parameters in +/proc/sys/s390dbf: + +There are currently 2 possible triggers, which stop the debug feature +globally. The first possibility is to use the "debug_active" sysctl. If +set to 1 the debug feature is running. If "debug_active" is set to 0 the +debug feature is turned off. + +The second trigger which stops the debug feature is a kernel oops. +That prevents the debug feature from overwriting debug information that +happened before the oops. After an oops you can reactivate the debug feature +by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not +suggested to use an oopsed kernel in a production environment. + +If you want to disallow the deactivation of the debug feature, you can use +the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug +feature cannot be stopped. If the debug feature is already stopped, it +will stay deactivated. + +---------------------------------------------------------------------------- + +Kernel Interfaces: +------------------ + +:: + + debug_info_t *debug_register(char *name, int pages, int nr_areas, + int buf_size); + +Parameter: + name: + Name of debug log (e.g. used for debugfs entry) + pages: + Number of pages, which will be allocated per area + nr_areas: + Number of debug areas + buf_size: + Size of data area in each debug entry + +Return Value: + Handle for generated debug area + + NULL if register failed + +Description: Allocates memory for a debug log + Must not be called within an interrupt handler + +---------------------------------------------------------------------------- + +:: + + debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, + int buf_size, mode_t mode, uid_t uid, + gid_t gid); + +Parameter: + name: + Name of debug log (e.g. used for debugfs entry) + pages: + Number of pages, which will be allocated per area + nr_areas: + Number of debug areas + buf_size: + Size of data area in each debug entry + mode: + File mode for debugfs files. E.g. S_IRWXUGO + uid: + User ID for debugfs files. Currently only 0 is + supported. + gid: + Group ID for debugfs files. Currently only 0 is + supported. + +Return Value: + Handle for generated debug area + + NULL if register failed + +Description: + Allocates memory for a debug log + Must not be called within an interrupt handler + +--------------------------------------------------------------------------- + +:: + + void debug_unregister (debug_info_t * id); + +Parameter: + id: + handle for debug log + +Return Value: + none + +Description: + frees memory for a debug log and removes all registered debug + views. + + Must not be called within an interrupt handler + +--------------------------------------------------------------------------- + +:: + + void debug_set_level (debug_info_t * id, int new_level); + +Parameter: id: handle for debug log + new_level: new debug level + +Return Value: + none + +Description: + Sets new actual debug level if new_level is valid. + +--------------------------------------------------------------------------- + +:: + + bool debug_level_enabled (debug_info_t * id, int level); + +Parameter: + id: + handle for debug log + level: + debug level + +Return Value: + True if level is less or equal to the current debug level. + +Description: + Returns true if debug events for the specified level would be + logged. Otherwise returns false. + +--------------------------------------------------------------------------- + +:: + + void debug_stop_all(void); + +Parameter: + none + +Return Value: + none + +Description: + stops the debug feature if stopping is allowed. Currently + used in case of a kernel oops. + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_event (debug_info_t* id, int level, void* data, + int length); + +Parameter: + id: + handle for debug log + level: + debug level + data: + pointer to data for debug entry + length: + length of data in bytes + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_int_event (debug_info_t * id, int level, + unsigned int data); + debug_entry_t* debug_long_event(debug_info_t * id, int level, + unsigned long data); + +Parameter: + id: + handle for debug log + level: + debug level + data: + integer value for debug entry + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_text_event (debug_info_t * id, int level, + const char* data); + +Parameter: + id: + handle for debug log + level: + debug level + data: + string for debug entry + +Return Value: + Address of written debug entry + +Description: + writes debug entry in ascii format to active debug area + (if level <= actual debug level) + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, + char* string,...); + +Parameter: + id: + handle for debug log + level: + debug level + string: + format string for debug entry + ...: + varargs used as in sprintf() + +Return Value: Address of written debug entry + +Description: + writes debug entry with format string and varargs (longs) to + active debug area (if level $<=$ actual debug level). + floats and long long datatypes cannot be used as varargs. + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, + int length); + +Parameter: + id: + handle for debug log + level: + debug level + data: + pointer to data for debug entry + length: + length of data in bytes + +Return Value: + Address of written debug entry + +Description: + writes debug entry to active debug area (if level <= actual + debug level) and switches to next debug area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_int_exception (debug_info_t * id, int level, + unsigned int data); + debug_entry_t* debug_long_exception(debug_info_t * id, int level, + unsigned long data); + +Parameter: id: handle for debug log + level: debug level + data: integer value for debug entry + +Return Value: Address of written debug entry + +Description: writes debug entry to active debug area (if level <= actual + debug level) and switches to next debug area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_text_exception (debug_info_t * id, int level, + const char* data); + +Parameter: id: handle for debug log + level: debug level + data: string for debug entry + +Return Value: Address of written debug entry + +Description: writes debug entry in ascii format to active debug area + (if level <= actual debug level) and switches to next debug + area + +--------------------------------------------------------------------------- + +:: + + debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, + char* string,...); + +Parameter: id: handle for debug log + level: debug level + string: format string for debug entry + ...: varargs used as in sprintf() + +Return Value: Address of written debug entry + +Description: writes debug entry with format string and varargs (longs) to + active debug area (if level $<=$ actual debug level) and + switches to next debug area. + floats and long long datatypes cannot be used as varargs. + +--------------------------------------------------------------------------- + +:: + + int debug_register_view (debug_info_t * id, struct debug_view *view); + +Parameter: id: handle for debug log + view: pointer to debug view struct + +Return Value: 0 : ok + < 0: Error + +Description: registers new debug view and creates debugfs dir entry + +--------------------------------------------------------------------------- + +:: + + int debug_unregister_view (debug_info_t * id, struct debug_view *view); + +Parameter: id: handle for debug log + view: pointer to debug view struct + +Return Value: 0 : ok + < 0: Error + +Description: unregisters debug view and removes debugfs dir entry + + + +Predefined views: +----------------- + +extern struct debug_view debug_hex_ascii_view; + +extern struct debug_view debug_raw_view; + +extern struct debug_view debug_sprintf_view; + +Examples +-------- + +:: + + /* + * hex_ascii- + raw-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and 4 byte data field */ + + debug_info = debug_register ("test", 1, 4, 4 ); + debug_register_view(debug_info,&debug_hex_ascii_view); + debug_register_view(debug_info,&debug_raw_view); + + debug_text_event(debug_info, 4 , "one "); + debug_int_exception(debug_info, 4, 4711); + debug_event(debug_info, 3, &debug_info, 4); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + module_init(init); + module_exit(cleanup); + +--------------------------------------------------------------------------- + +:: + + /* + * sprintf-view Example + */ + + #include + #include + + static debug_info_t* debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and data field for */ + /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ + + debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); + debug_register_view(debug_info,&debug_sprintf_view); + + debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); + debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); + + return 0; + } + + static void cleanup(void) + { + debug_unregister (debug_info); + } + + module_init(init); + module_exit(cleanup); + +Debugfs Interface +----------------- +Views to the debug logs can be investigated through reading the corresponding +debugfs-files: + +Example:: + + > ls /sys/kernel/debug/s390dbf/dasd + flush hex_ascii level pages raw + > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s + 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... + 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE + 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... + 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP + 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD + 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... + 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... + 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... + 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE + 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... + +See section about predefined views for explanation of the above output! + +Changing the debug level +------------------------ + +Example:: + + + > cat /sys/kernel/debug/s390dbf/dasd/level + 3 + > echo "5" > /sys/kernel/debug/s390dbf/dasd/level + > cat /sys/kernel/debug/s390dbf/dasd/level + 5 + +Flushing debug areas +-------------------- +Debug areas can be flushed with piping the number of the desired +area (0...n) to the debugfs file "flush". When using "-" all debug areas +are flushed. + +Examples: + +1. Flush debug area 0:: + + > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush + +2. Flush all debug areas:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush + +Changing the size of debug areas +------------------------------------ +It is possible the change the size of debug areas through piping +the number of pages to the debugfs file "pages". The resize request will +also flush the debug areas. + +Example: + +Define 4 pages for the debug areas of debug feature "dasd":: + + > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages + +Stooping the debug feature +-------------------------- +Example: + +1. Check if stopping is allowed:: + + > cat /proc/sys/s390dbf/debug_stoppable + +2. Stop debug feature:: + + > echo 0 > /proc/sys/s390dbf/debug_active + +lcrash Interface +---------------- +It is planned that the dump analysis tool lcrash gets an additional command +'s390dbf' to display all the debug logs. With this tool it will be possible +to investigate the debug logs on a live system and with a memory dump after +a system crash. + +Investigating raw memory +------------------------ +One last possibility to investigate the debug logs at a live +system and after a system crash is to look at the raw memory +under VM or at the Service Element. +It is possible to find the anker of the debug-logs through +the 'debug_area_first' symbol in the System map. Then one has +to follow the correct pointers of the data-structures defined +in debug.h and find the debug-areas in memory. +Normally modules which use the debug feature will also have +a global variable with the pointer to the debug-logs. Following +this pointer it will also be possible to find the debug logs in +memory. + +For this method it is recommended to use '16 * x + 4' byte (x = 0..n) +for the length of the data field in debug_register() in +order to see the debug entries well formatted. + + +Predefined Views +---------------- + +There are three predefined views: hex_ascii, raw and sprintf. +The hex_ascii view shows the data field in hex and ascii representation +(e.g. '45 43 4b 44 | ECKD'). +The raw view returns a bytestream as the debug areas are stored in memory. + +The sprintf view formats the debug entries in the same way as the sprintf +function would do. The sprintf event/exception functions write to the +debug entry a pointer to the format string (size = sizeof(long)) +and for each vararg a long value. So e.g. for a debug entry with a format +string plus two varargs one would need to allocate a (3 * sizeof(long)) +byte data area in the debug_register() function. + +IMPORTANT: + Using "%s" in sprintf event functions is dangerous. You can only + use "%s" in the sprintf event functions, if the memory for the passed string + is available as long as the debug feature exists. The reason behind this is + that due to performance considerations only a pointer to the string is stored + in the debug feature. If you log a string that is freed afterwards, you will + get an OOPS when inspecting the debug feature, because then the debug feature + will access the already freed memory. + +NOTE: + If using the sprintf view do NOT use other event/exception functions + than the sprintf-event and -exception functions. + +The format of the hex_ascii and sprintf view is as follows: + +- Number of area +- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated + Universal Time (UTC), January 1, 1970) +- level of debug entry +- Exception flag (* = Exception) +- Cpu-Number of calling task +- Return Address to caller +- data field + +The format of the raw view is: + +- Header as described in debug.h +- datafield + +A typical line of the hex_ascii view will look like the following (first line +is only for explanation and will not be displayed when 'cating' the view): + +area time level exception cpu caller data (hex + ascii) +-------------------------------------------------------------------------- +00 00964419409:440690 1 - 00 88023fe + + +Defining views +-------------- + +Views are specified with the 'debug_view' structure. There are defined +callback functions which are used for reading and writing the debugfs files:: + + struct debug_view { + char name[DEBUG_MAX_PROCF_LEN]; + debug_prolog_proc_t* prolog_proc; + debug_header_proc_t* header_proc; + debug_format_proc_t* format_proc; + debug_input_proc_t* input_proc; + void* private_data; + }; + +where:: + + typedef int (debug_header_proc_t) (debug_info_t* id, + struct debug_view* view, + int area, + debug_entry_t* entry, + char* out_buf); + + typedef int (debug_format_proc_t) (debug_info_t* id, + struct debug_view* view, char* out_buf, + const char* in_buf); + typedef int (debug_prolog_proc_t) (debug_info_t* id, + struct debug_view* view, + char* out_buf); + typedef int (debug_input_proc_t) (debug_info_t* id, + struct debug_view* view, + struct file* file, const char* user_buf, + size_t in_buf_size, loff_t* offset); + + +The "private_data" member can be used as pointer to view specific data. +It is not used by the debug feature itself. + +The output when reading a debugfs file is structured like this:: + + "prolog_proc output" + + "header_proc output 1" "format_proc output 1" + "header_proc output 2" "format_proc output 2" + "header_proc output 3" "format_proc output 3" + ... + +When a view is read from the debugfs, the Debug Feature calls the +'prolog_proc' once for writing the prolog. +Then 'header_proc' and 'format_proc' are called for each +existing debug entry. + +The input_proc can be used to implement functionality when it is written to +the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). + +For header_proc there can be used the default function +debug_dflt_header_fn() which is defined in debug.h. +and which produces the same header output as the predefined views. +E.g:: + + 00 00964419409:440761 2 - 00 88023ec + +In order to see how to use the callback functions check the implementation +of the default views! + +Example:: + + #include + + #define UNKNOWNSTR "data: %08x" + + const char* messages[] = + {"This error...........\n", + "That error...........\n", + "Problem..............\n", + "Something went wrong.\n", + "Everything ok........\n", + NULL + }; + + static int debug_test_format_fn( + debug_info_t * id, struct debug_view *view, + char *out_buf, const char *in_buf + ) + { + int i, rc = 0; + + if(id->buf_size >= 4) { + int msg_nr = *((int*)in_buf); + if(msg_nr < sizeof(messages)/sizeof(char*) - 1) + rc += sprintf(out_buf, "%s", messages[msg_nr]); + else + rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); + } + out: + return rc; + } + + struct debug_view debug_test_view = { + "myview", /* name of view */ + NULL, /* no prolog */ + &debug_dflt_header_fn, /* default header for each entry */ + &debug_test_format_fn, /* our own format function */ + NULL, /* no input function */ + NULL /* no private data */ + }; + +test: +===== + +:: + + debug_info_t *debug_info; + ... + debug_info = debug_register ("test", 0, 4, 4 )); + debug_register_view(debug_info, &debug_test_view); + for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i); + + > cat /sys/kernel/debug/s390dbf/test/myview + 00 00964419734:611402 1 - 00 88042ca This error........... + 00 00964419734:611405 1 - 00 88042ca That error........... + 00 00964419734:611408 1 - 00 88042ca Problem.............. + 00 00964419734:611411 1 - 00 88042ca Something went wrong. + 00 00964419734:611414 1 - 00 88042ca Everything ok........ + 00 00964419734:611417 1 - 00 88042ca data: 00000005 + 00 00964419734:611419 1 - 00 88042ca data: 00000006 + 00 00964419734:611422 1 - 00 88042ca data: 00000007 + 00 00964419734:611425 1 - 00 88042ca data: 00000008 + 00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt deleted file mode 100644 index 61329fd62e89..000000000000 --- a/Documentation/s390/s390dbf.txt +++ /dev/null @@ -1,667 +0,0 @@ -S390 Debug Feature -================== - -files: arch/s390/kernel/debug.c - arch/s390/include/asm/debug.h - -Description: ------------- -The goal of this feature is to provide a kernel debug logging API -where log records can be stored efficiently in memory, where each component -(e.g. device drivers) can have one separate debug log. -One purpose of this is to inspect the debug logs after a production system crash -in order to analyze the reason for the crash. -If the system still runs but only a subcomponent which uses dbf fails, -it is possible to look at the debug logs on a live system via the Linux -debugfs filesystem. -The debug feature may also very useful for kernel and driver development. - -Design: -------- -Kernel components (e.g. device drivers) can register themselves at the debug -feature with the function call debug_register(). This function initializes a -debug log for the caller. For each debug log exists a number of debug areas -where exactly one is active at one time. Each debug area consists of contiguous -pages in memory. In the debug areas there are stored debug entries (log records) -which are written by event- and exception-calls. - -An event-call writes the specified debug entry to the active debug -area and updates the log pointer for the active area. If the end -of the active debug area is reached, a wrap around is done (ring buffer) -and the next debug entry will be written at the beginning of the active -debug area. - -An exception-call writes the specified debug entry to the log and -switches to the next debug area. This is done in order to be sure -that the records which describe the origin of the exception are not -overwritten when a wrap around for the current area occurs. - -The debug areas themselves are also ordered in form of a ring buffer. -When an exception is thrown in the last debug area, the following debug -entries are then written again in the very first area. - -There are three versions for the event- and exception-calls: One for -logging raw data, one for text and one for numbers. - -Each debug entry contains the following data: - -- Timestamp -- Cpu-Number of calling task -- Level of debug entry (0...6) -- Return Address to caller -- Flag, if entry is an exception or not - -The debug logs can be inspected in a live system through entries in -the debugfs-filesystem. Under the toplevel directory "s390dbf" there is -a directory for each registered component, which is named like the -corresponding component. The debugfs normally should be mounted to -/sys/kernel/debug therefore the debug feature can be accessed under -/sys/kernel/debug/s390dbf. - -The content of the directories are files which represent different views -to the debug log. Each component can decide which views should be -used through registering them with the function debug_register_view(). -Predefined views for hex/ascii, sprintf and raw binary data are provided. -It is also possible to define other views. The content of -a view can be inspected simply by reading the corresponding debugfs file. - -All debug logs have an actual debug level (range from 0 to 6). -The default level is 3. Event and Exception functions have a 'level' -parameter. Only debug entries with a level that is lower or equal -than the actual level are written to the log. This means, when -writing events, high priority log entries should have a low level -value whereas low priority entries should have a high one. -The actual debug level can be changed with the help of the debugfs-filesystem -through writing a number string "x" to the 'level' debugfs file which is -provided for every debug log. Debugging can be switched off completely -by using "-" on the 'level' debugfs file. - -Example: - -> echo "-" > /sys/kernel/debug/s390dbf/dasd/level - -It is also possible to deactivate the debug feature globally for every -debug log. You can change the behavior using 2 sysctl parameters in -/proc/sys/s390dbf: -There are currently 2 possible triggers, which stop the debug feature -globally. The first possibility is to use the "debug_active" sysctl. If -set to 1 the debug feature is running. If "debug_active" is set to 0 the -debug feature is turned off. -The second trigger which stops the debug feature is a kernel oops. -That prevents the debug feature from overwriting debug information that -happened before the oops. After an oops you can reactivate the debug feature -by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not -suggested to use an oopsed kernel in a production environment. -If you want to disallow the deactivation of the debug feature, you can use -the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug -feature cannot be stopped. If the debug feature is already stopped, it -will stay deactivated. - -Kernel Interfaces: ------------------- - ----------------------------------------------------------------------------- -debug_info_t *debug_register(char *name, int pages, int nr_areas, - int buf_size); - -Parameter: name: Name of debug log (e.g. used for debugfs entry) - pages: number of pages, which will be allocated per area - nr_areas: number of debug areas - buf_size: size of data area in each debug entry - -Return Value: Handle for generated debug area - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ----------------------------------------------------------------------------- -debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, - int buf_size, mode_t mode, uid_t uid, - gid_t gid); - -Parameter: name: Name of debug log (e.g. used for debugfs entry) - pages: Number of pages, which will be allocated per area - nr_areas: Number of debug areas - buf_size: Size of data area in each debug entry - mode: File mode for debugfs files. E.g. S_IRWXUGO - uid: User ID for debugfs files. Currently only 0 is - supported. - gid: Group ID for debugfs files. Currently only 0 is - supported. - -Return Value: Handle for generated debug area - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- -void debug_unregister (debug_info_t * id); - -Parameter: id: handle for debug log - -Return Value: none - -Description: frees memory for a debug log and removes all registered debug - views. - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- -void debug_set_level (debug_info_t * id, int new_level); - -Parameter: id: handle for debug log - new_level: new debug level - -Return Value: none - -Description: Sets new actual debug level if new_level is valid. - ---------------------------------------------------------------------------- -bool debug_level_enabled (debug_info_t * id, int level); - -Parameter: id: handle for debug log - level: debug level - -Return Value: True if level is less or equal to the current debug level. - -Description: Returns true if debug events for the specified level would be - logged. Otherwise returns false. ---------------------------------------------------------------------------- -void debug_stop_all(void); - -Parameter: none - -Return Value: none - -Description: stops the debug feature if stopping is allowed. Currently - used in case of a kernel oops. - ---------------------------------------------------------------------------- -debug_entry_t* debug_event (debug_info_t* id, int level, void* data, - int length); - -Parameter: id: handle for debug log - level: debug level - data: pointer to data for debug entry - length: length of data in bytes - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- -debug_entry_t* debug_int_event (debug_info_t * id, int level, - unsigned int data); -debug_entry_t* debug_long_event(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- -debug_entry_t* debug_text_event (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) - ---------------------------------------------------------------------------- -debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level). - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, - int length); - -Parameter: id: handle for debug log - level: debug level - data: pointer to data for debug entry - length: length of data in bytes - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- -debug_entry_t* debug_int_exception (debug_info_t * id, int level, - unsigned int data); -debug_entry_t* debug_long_exception(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- -debug_entry_t* debug_text_exception (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) and switches to next debug - area - ---------------------------------------------------------------------------- -debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level) and - switches to next debug area. - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -int debug_register_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: registers new debug view and creates debugfs dir entry - ---------------------------------------------------------------------------- -int debug_unregister_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: unregisters debug view and removes debugfs dir entry - - - -Predefined views: ------------------ - -extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; -extern struct debug_view debug_sprintf_view; - -Examples --------- - -/* - * hex_ascii- + raw-view Example - */ - -#include -#include - -static debug_info_t* debug_info; - -static int init(void) -{ - /* register 4 debug areas with one page each and 4 byte data field */ - - debug_info = debug_register ("test", 1, 4, 4 ); - debug_register_view(debug_info,&debug_hex_ascii_view); - debug_register_view(debug_info,&debug_raw_view); - - debug_text_event(debug_info, 4 , "one "); - debug_int_exception(debug_info, 4, 4711); - debug_event(debug_info, 3, &debug_info, 4); - - return 0; -} - -static void cleanup(void) -{ - debug_unregister (debug_info); -} - -module_init(init); -module_exit(cleanup); - ---------------------------------------------------------------------------- - -/* - * sprintf-view Example - */ - -#include -#include - -static debug_info_t* debug_info; - -static int init(void) -{ - /* register 4 debug areas with one page each and data field for */ - /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ - - debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info,&debug_sprintf_view); - - debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); - debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); - - return 0; -} - -static void cleanup(void) -{ - debug_unregister (debug_info); -} - -module_init(init); -module_exit(cleanup); - - - -Debugfs Interface ----------------- -Views to the debug logs can be investigated through reading the corresponding -debugfs-files: - -Example: - -> ls /sys/kernel/debug/s390dbf/dasd -flush hex_ascii level pages raw -> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s -00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... -00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE -00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... -00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP -01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD -01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... -01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... -01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... -01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE -01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... - -See section about predefined views for explanation of the above output! - -Changing the debug level ------------------------- - -Example: - - -> cat /sys/kernel/debug/s390dbf/dasd/level -3 -> echo "5" > /sys/kernel/debug/s390dbf/dasd/level -> cat /sys/kernel/debug/s390dbf/dasd/level -5 - -Flushing debug areas --------------------- -Debug areas can be flushed with piping the number of the desired -area (0...n) to the debugfs file "flush". When using "-" all debug areas -are flushed. - -Examples: - -1. Flush debug area 0: -> echo "0" > /sys/kernel/debug/s390dbf/dasd/flush - -2. Flush all debug areas: -> echo "-" > /sys/kernel/debug/s390dbf/dasd/flush - -Changing the size of debug areas ------------------------------------- -It is possible the change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. - -Example: - -Define 4 pages for the debug areas of debug feature "dasd": -> echo "4" > /sys/kernel/debug/s390dbf/dasd/pages - -Stooping the debug feature --------------------------- -Example: - -1. Check if stopping is allowed -> cat /proc/sys/s390dbf/debug_stoppable -2. Stop debug feature -> echo 0 > /proc/sys/s390dbf/debug_active - -lcrash Interface ----------------- -It is planned that the dump analysis tool lcrash gets an additional command -'s390dbf' to display all the debug logs. With this tool it will be possible -to investigate the debug logs on a live system and with a memory dump after -a system crash. - -Investigating raw memory ------------------------- -One last possibility to investigate the debug logs at a live -system and after a system crash is to look at the raw memory -under VM or at the Service Element. -It is possible to find the anker of the debug-logs through -the 'debug_area_first' symbol in the System map. Then one has -to follow the correct pointers of the data-structures defined -in debug.h and find the debug-areas in memory. -Normally modules which use the debug feature will also have -a global variable with the pointer to the debug-logs. Following -this pointer it will also be possible to find the debug logs in -memory. - -For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in debug_register() in -order to see the debug entries well formatted. - - -Predefined Views ----------------- - -There are three predefined views: hex_ascii, raw and sprintf. -The hex_ascii view shows the data field in hex and ascii representation -(e.g. '45 43 4b 44 | ECKD'). -The raw view returns a bytestream as the debug areas are stored in memory. - -The sprintf view formats the debug entries in the same way as the sprintf -function would do. The sprintf event/exception functions write to the -debug entry a pointer to the format string (size = sizeof(long)) -and for each vararg a long value. So e.g. for a debug entry with a format -string plus two varargs one would need to allocate a (3 * sizeof(long)) -byte data area in the debug_register() function. - -IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only -use "%s" in the sprintf event functions, if the memory for the passed string is -available as long as the debug feature exists. The reason behind this is that -due to performance considerations only a pointer to the string is stored in -the debug feature. If you log a string that is freed afterwards, you will get -an OOPS when inspecting the debug feature, because then the debug feature will -access the already freed memory. - -NOTE: If using the sprintf view do NOT use other event/exception functions -than the sprintf-event and -exception functions. - -The format of the hex_ascii and sprintf view is as follows: -- Number of area -- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated - Universal Time (UTC), January 1, 1970) -- level of debug entry -- Exception flag (* = Exception) -- Cpu-Number of calling task -- Return Address to caller -- data field - -The format of the raw view is: -- Header as described in debug.h -- datafield - -A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view): - -area time level exception cpu caller data (hex + ascii) --------------------------------------------------------------------------- -00 00964419409:440690 1 - 00 88023fe - - -Defining views --------------- - -Views are specified with the 'debug_view' structure. There are defined -callback functions which are used for reading and writing the debugfs files: - -struct debug_view { - char name[DEBUG_MAX_PROCF_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; -}; - -where - -typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, - int area, - debug_entry_t* entry, - char* out_buf); - -typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); -typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); -typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, const char* user_buf, - size_t in_buf_size, loff_t* offset); - - -The "private_data" member can be used as pointer to view specific data. -It is not used by the debug feature itself. - -The output when reading a debugfs file is structured like this: - -"prolog_proc output" - -"header_proc output 1" "format_proc output 1" -"header_proc output 2" "format_proc output 2" -"header_proc output 3" "format_proc output 3" -... - -When a view is read from the debugfs, the Debug Feature calls the -'prolog_proc' once for writing the prolog. -Then 'header_proc' and 'format_proc' are called for each -existing debug entry. - -The input_proc can be used to implement functionality when it is written to -the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). - -For header_proc there can be used the default function -debug_dflt_header_fn() which is defined in debug.h. -and which produces the same header output as the predefined views. -E.g: -00 00964419409:440761 2 - 00 88023ec - -In order to see how to use the callback functions check the implementation -of the default views! - -Example - -#include - -#define UNKNOWNSTR "data: %08x" - -const char* messages[] = -{"This error...........\n", - "That error...........\n", - "Problem..............\n", - "Something went wrong.\n", - "Everything ok........\n", - NULL -}; - -static int debug_test_format_fn( - debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf -) -{ - int i, rc = 0; - - if(id->buf_size >= 4) { - int msg_nr = *((int*)in_buf); - if(msg_nr < sizeof(messages)/sizeof(char*) - 1) - rc += sprintf(out_buf, "%s", messages[msg_nr]); - else - rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); - } - out: - return rc; -} - -struct debug_view debug_test_view = { - "myview", /* name of view */ - NULL, /* no prolog */ - &debug_dflt_header_fn, /* default header for each entry */ - &debug_test_format_fn, /* our own format function */ - NULL, /* no input function */ - NULL /* no private data */ -}; - -===== -test: -===== -debug_info_t *debug_info; -... -debug_info = debug_register ("test", 0, 4, 4 )); -debug_register_view(debug_info, &debug_test_view); -for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i); - -> cat /sys/kernel/debug/s390dbf/test/myview -00 00964419734:611402 1 - 00 88042ca This error........... -00 00964419734:611405 1 - 00 88042ca That error........... -00 00964419734:611408 1 - 00 88042ca Problem.............. -00 00964419734:611411 1 - 00 88042ca Something went wrong. -00 00964419734:611414 1 - 00 88042ca Everything ok........ -00 00964419734:611417 1 - 00 88042ca data: 00000005 -00 00964419734:611419 1 - 00 88042ca data: 00000006 -00 00964419734:611422 1 - 00 88042ca data: 00000007 -00 00964419734:611425 1 - 00 88042ca data: 00000008 -00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/s390/text_files.rst b/Documentation/s390/text_files.rst new file mode 100644 index 000000000000..c94d05d4fa17 --- /dev/null +++ b/Documentation/s390/text_files.rst @@ -0,0 +1,11 @@ +ibm 3270 changelog +------------------ + +.. include:: 3270.ChangeLog + :literal: + +ibm 3270 config3270.sh +---------------------- + +.. literalinclude:: config3270.sh + :language: shell diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst new file mode 100644 index 000000000000..b5c51f7c748d --- /dev/null +++ b/Documentation/s390/vfio-ap.rst @@ -0,0 +1,866 @@ +=============================== +Adjunct Processor (AP) facility +=============================== + + +Introduction +============ +The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised +of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. +The AP devices provide cryptographic functions to all CPUs assigned to a +linux system running in an IBM Z system LPAR. + +The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap +is to make AP cards available to KVM guests using the VFIO mediated device +framework. This implementation relies considerably on the s390 virtualization +facilities which do most of the hard work of providing direct access to AP +devices. + +AP Architectural Overview +========================= +To facilitate the comprehension of the design, let's start with some +definitions: + +* AP adapter + + An AP adapter is an IBM Z adapter card that can perform cryptographic + functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters + assigned to the LPAR in which a linux host is running will be available to + the linux host. Each adapter is identified by a number from 0 to 255; however, + the maximum adapter number is determined by machine model and/or adapter type. + When installed, an AP adapter is accessed by AP instructions executed by any + CPU. + + The AP adapter cards are assigned to a given LPAR via the system's Activation + Profile which can be edited via the HMC. When the linux host system is IPL'd + in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and + creates a sysfs device for each assigned adapter. For example, if AP adapters + 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following + sysfs device entries:: + + /sys/devices/ap/card04 + /sys/devices/ap/card0a + + Symbolic links to these devices will also be created in the AP bus devices + sub-directory:: + + /sys/bus/ap/devices/[card04] + /sys/bus/ap/devices/[card04] + +* AP domain + + An adapter is partitioned into domains. An adapter can hold up to 256 domains + depending upon the adapter type and hardware configuration. A domain is + identified by a number from 0 to 255; however, the maximum domain number is + determined by machine model and/or adapter type.. A domain can be thought of + as a set of hardware registers and memory used for processing AP commands. A + domain can be configured with a secure private key used for clear key + encryption. A domain is classified in one of two ways depending upon how it + may be accessed: + + * Usage domains are domains that are targeted by an AP instruction to + process an AP command. + + * Control domains are domains that are changed by an AP command sent to a + usage domain; for example, to set the secure private key for the control + domain. + + The AP usage and control domains are assigned to a given LPAR via the system's + Activation Profile which can be edited via the HMC. When a linux host system + is IPL'd in the LPAR, the AP bus module detects the AP usage and control + domains assigned to the LPAR. The domain number of each usage domain and + adapter number of each AP adapter are combined to create AP queue devices + (see AP Queue section below). The domain number of each control domain will be + represented in a bitmask and stored in a sysfs file + /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least + significant bit, correspond to domains 0-255. + +* AP Queue + + An AP queue is the means by which an AP command is sent to a usage domain + inside a specific adapter. An AP queue is identified by a tuple + comprised of an AP adapter ID (APID) and an AP queue index (APQI). The + APQI corresponds to a given usage domain number within the adapter. This tuple + forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP + instructions include a field containing the APQN to identify the AP queue to + which the AP command is to be sent for processing. + + The AP bus will create a sysfs device for each APQN that can be derived from + the cross product of the AP adapter and usage domain numbers detected when the + AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage + domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the + following sysfs entries:: + + /sys/devices/ap/card04/04.0006 + /sys/devices/ap/card04/04.0047 + /sys/devices/ap/card0a/0a.0006 + /sys/devices/ap/card0a/0a.0047 + + The following symbolic links to these devices will be created in the AP bus + devices subdirectory:: + + /sys/bus/ap/devices/[04.0006] + /sys/bus/ap/devices/[04.0047] + /sys/bus/ap/devices/[0a.0006] + /sys/bus/ap/devices/[0a.0047] + +* AP Instructions: + + There are three AP instructions: + + * NQAP: to enqueue an AP command-request message to a queue + * DQAP: to dequeue an AP command-reply message from a queue + * PQAP: to administer the queues + + AP instructions identify the domain that is targeted to process the AP + command; this must be one of the usage domains. An AP command may modify a + domain that is not one of the usage domains, but the modified domain + must be one of the control domains. + +AP and SIE +========== +Let's now take a look at how AP instructions executed on a guest are interpreted +by the hardware. + +A satellite control block called the Crypto Control Block (CRYCB) is attached to +our main hardware virtualization control block. The CRYCB contains three fields +to identify the adapters, usage domains and control domains assigned to the KVM +guest: + +* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned + to the KVM guest. Each bit in the mask, from left to right (i.e. from most + significant to least significant bit in big endian order), corresponds to + an APID from 0-255. If a bit is set, the corresponding adapter is valid for + use by the KVM guest. + +* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains + assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from + most significant to least significant bit in big endian order), corresponds to + an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue + is valid for use by the KVM guest. + +* The AP Domain Mask field is a bit mask that identifies the AP control domains + assigned to the KVM guest. The ADM bit mask controls which domains can be + changed by an AP command-request message sent to a usage domain from the + guest. Each bit in the mask, from left to right (i.e. from most significant to + least significant bit in big endian order), corresponds to a domain from + 0-255. If a bit is set, the corresponding domain can be modified by an AP + command-request message sent to a usage domain. + +If you recall from the description of an AP Queue, AP instructions include +an APQN to identify the AP queue to which an AP command-request message is to be +sent (NQAP and PQAP instructions), or from which a command-reply message is to +be received (DQAP instruction). The validity of an APQN is defined by the matrix +calculated from the APM and AQM; it is the cross product of all assigned adapter +numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1 +and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6), +(2,5) and (2,6) will be valid for the guest. + +The APQNs can provide secure key functionality - i.e., a private key is stored +on the adapter card for each of its domains - so each APQN must be assigned to +at most one guest or to the linux host:: + + Example 1: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1,2 domain 7 + + This is valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (1,7), (2,7) + + Example 2: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapters 3,4 domains 5,6 + + This is also valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) + + Example 3: Invalid configuration: + -------------------------------- + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1 domains 6,7 + + This is an invalid configuration because both guests have access to + APQN (1,6). + +The Design +========== +The design introduces three new objects: + +1. AP matrix device +2. VFIO AP device driver (vfio_ap.ko) +3. VFIO AP mediated matrix pass-through device + +The VFIO AP device driver +------------------------- +The VFIO AP (vfio_ap) device driver serves the following purposes: + +1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. + +2. Sets up the VFIO mediated device interfaces to manage a mediated matrix + device and creates the sysfs interfaces for assigning adapters, usage + domains, and control domains comprising the matrix for a KVM guest. + +3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's + SIE state description to grant the guest access to a matrix of AP devices + +Reserve APQNs for exclusive use of KVM guests +--------------------------------------------- +The following block diagram illustrates the mechanism by which APQNs are +reserved:: + + +------------------+ + 7 remove | | + +--------------------> cex4queue driver | + | | | + | +------------------+ + | + | + | +------------------+ +----------------+ + | 5 register driver | | 3 create | | + | +----------------> Device core +----------> matrix device | + | | | | | | + | | +--------^---------+ +----------------+ + | | | + | | +-------------------+ + | | +-----------------------------------+ | + | | | 4 register AP driver | | 2 register device + | | | | | + +--------+---+-v---+ +--------+-------+-+ + | | | | + | ap_bus +--------------------- > vfio_ap driver | + | | 8 probe | | + +--------^---------+ +--^--^------------+ + 6 edit | | | + apmask | +-----------------------------+ | 9 mdev create + aqmask | | 1 modprobe | + +--------+-----+---+ +----------------+-+ +----------------+ + | | | |8 create | mediated | + | admin | | VFIO device core |---------> matrix | + | + | | | device | + +------+-+---------+ +--------^---------+ +--------^-------+ + | | | | + | | 9 create vfio_ap-passthrough | | + | +------------------------------+ | + +-------------------------------------------------------------+ + 10 assign adapter/domain/control domain + +The process for reserving an AP queue for use by a KVM guest is: + +1. The administrator loads the vfio_ap device driver +2. The vfio-ap driver during its initialization will register a single 'matrix' + device with the device core. This will serve as the parent device for + all mediated matrix devices used to configure an AP matrix for a guest. +3. The /sys/devices/vfio_ap/matrix device is created by the device core +4. The vfio_ap device driver will register with the AP bus for AP queue devices + of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap + driver's probe and remove callback interfaces. Devices older than CEX4 queues + are not supported to simplify the implementation by not needlessly + complicating the design by supporting older devices that will go out of + service in the relatively near future, and for which there are few older + systems around on which to test. +5. The AP bus registers the vfio_ap device driver with the device core +6. The administrator edits the AP adapter and queue masks to reserve AP queues + for use by the vfio_ap device driver. +7. The AP bus removes the AP queues reserved for the vfio_ap driver from the + default zcrypt cex4queue driver. +8. The AP bus probes the vfio_ap device driver to bind the queues reserved for + it. +9. The administrator creates a passthrough type mediated matrix device to be + used by a guest +10. The administrator assigns the adapters, usage domains and control domains + to be exclusively used by a guest. + +Set up the VFIO mediated device interfaces +------------------------------------------ +The VFIO AP device driver utilizes the common interface of the VFIO mediated +device core driver to: + +* Register an AP mediated bus driver to add a mediated matrix device to and + remove it from a VFIO group. +* Create and destroy a mediated matrix device +* Add a mediated matrix device to and remove it from the AP mediated bus driver +* Add a mediated matrix device to and remove it from an IOMMU group + +The following high-level block diagram shows the main components and interfaces +of the VFIO AP mediated matrix device driver:: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ap.ko |<-> matrix + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +During initialization of the vfio_ap module, the matrix device is registered +with an 'mdev_parent_ops' structure that provides the sysfs attribute +structures, mdev functions and callback interfaces for managing the mediated +matrix device. + +* sysfs attribute structures: + + supported_type_groups + The VFIO mediated device framework supports creation of user-defined + mediated device types. These mediated device types are specified + via the 'supported_type_groups' structure when a device is registered + with the mediated device framework. The registration process creates the + sysfs structures for each mediated device type specified in the + 'mdev_supported_types' sub-directory of the device being registered. Along + with the device type, the sysfs attributes of the mediated device type are + provided. + + The VFIO AP device driver will register one mediated device type for + passthrough devices: + + /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough + + Only the read-only attributes required by the VFIO mdev framework will + be provided:: + + ... name + ... device_api + ... available_instances + ... device_api + + Where: + + * name: + specifies the name of the mediated device type + * device_api: + the mediated device type's API + * available_instances: + the number of mediated matrix passthrough devices + that can be created + * device_api: + specifies the VFIO API + mdev_attr_groups + This attribute group identifies the user-defined sysfs attributes of the + mediated device. When a device is registered with the VFIO mediated device + framework, the sysfs attribute files identified in the 'mdev_attr_groups' + structure will be created in the mediated matrix device's directory. The + sysfs attributes for a mediated matrix device are: + + assign_adapter / unassign_adapter: + Write-only attributes for assigning/unassigning an AP adapter to/from the + mediated matrix device. To assign/unassign an adapter, the APID of the + adapter is echoed to the respective attribute file. + assign_domain / unassign_domain: + Write-only attributes for assigning/unassigning an AP usage domain to/from + the mediated matrix device. To assign/unassign a domain, the domain + number of the the usage domain is echoed to the respective attribute + file. + matrix: + A read-only file for displaying the APQNs derived from the cross product + of the adapter and domain numbers assigned to the mediated matrix device. + assign_control_domain / unassign_control_domain: + Write-only attributes for assigning/unassigning an AP control domain + to/from the mediated matrix device. To assign/unassign a control domain, + the ID of the domain to be assigned/unassigned is echoed to the respective + attribute file. + control_domains: + A read-only file for displaying the control domain numbers assigned to the + mediated matrix device. + +* functions: + + create: + allocates the ap_matrix_mdev structure used by the vfio_ap driver to: + + * Store the reference to the KVM structure for the guest using the mdev + * Store the AP matrix configuration for the adapters, domains, and control + domains assigned via the corresponding sysfs attributes files + + remove: + deallocates the mediated matrix device's ap_matrix_mdev structure. This will + be allowed only if a running guest is not using the mdev. + +* callback interfaces + + open: + The vfio_ap driver uses this callback to register a + VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix + device. The open is invoked when QEMU connects the VFIO iommu group + for the mdev matrix device to the MDEV bus. Access to the KVM structure used + to configure the KVM guest is provided via this callback. The KVM structure, + is used to configure the guest's access to the AP matrix defined via the + mediated matrix device's sysfs attribute files. + release: + unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the + mdev matrix device and deconfigures the guest's AP matrix. + +Configure the APM, AQM and ADM in the CRYCB +------------------------------------------- +Configuring the AP matrix for a KVM guest will be performed when the +VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier +function is called when QEMU connects to KVM. The guest's AP matrix is +configured via it's CRYCB by: + +* Setting the bits in the APM corresponding to the APIDs assigned to the + mediated matrix device via its 'assign_adapter' interface. +* Setting the bits in the AQM corresponding to the domains assigned to the + mediated matrix device via its 'assign_domain' interface. +* Setting the bits in the ADM corresponding to the domain dIDs assigned to the + mediated matrix device via its 'assign_control_domains' interface. + +The CPU model features for AP +----------------------------- +The AP stack relies on the presence of the AP instructions as well as two +facilities: The AP Facilities Test (APFT) facility; and the AP Query +Configuration Information (QCI) facility. These features/facilities are made +available to a KVM guest via the following CPU model features: + +1. ap: Indicates whether the AP instructions are installed on the guest. This + feature will be enabled by KVM only if the AP instructions are installed + on the host. + +2. apft: Indicates the APFT facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 15 is set). + +3. apqci: Indicates the AP QCI facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 12 is set). + +Note: If the user chooses to specify a CPU model different than the 'host' +model to QEMU, the CPU model features and facilities need to be turned on +explicitly; for example:: + + /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on + +A guest can be precluded from using AP features/facilities by turning them off +explicitly; for example:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off + +Note: If the APFT facility is turned off (apft=off) for the guest, the guest +will not see any AP devices. The zcrypt device drivers that register for type 10 +and newer AP devices - i.e., the cex4card and cex4queue device drivers - need +the APFT facility to ascertain the facilities installed on a given AP device. If +the APFT facility is not installed on the guest, then the probe of device +drivers will fail since only type 10 and newer devices can be configured for +guest use. + +Example +======= +Let's now provide an example to illustrate how KVM guests may be given +access to AP facilities. For this example, we will show how to configure +three guests such that executing the lszcrypt command on the guests would +look like this: + +Guest1 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +05 CEX5C CCA-Coproc +05.0004 CEX5C CCA-Coproc +05.00ab CEX5C CCA-Coproc +06 CEX5A Accelerator +06.0004 CEX5A Accelerator +06.00ab CEX5C CCA-Coproc +=========== ===== ============ + +Guest2 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +05 CEX5A Accelerator +05.0047 CEX5A Accelerator +05.00ff CEX5A Accelerator +=========== ===== ============ + +Guest2 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +06 CEX5A Accelerator +06.0047 CEX5A Accelerator +06.00ff CEX5A Accelerator +=========== ===== ============ + +These are the steps: + +1. Install the vfio_ap module on the linux host. The dependency chain for the + vfio_ap module is: + * iommu + * s390 + * zcrypt + * vfio + * vfio_mdev + * vfio_mdev_device + * KVM + + To build the vfio_ap module, the kernel build must be configured with the + following Kconfig elements selected: + * IOMMU_SUPPORT + * S390 + * ZCRYPT + * S390_AP_IOMMU + * VFIO + * VFIO_MDEV + * VFIO_MDEV_DEVICE + * KVM + + If using make menuconfig select the following to build the vfio_ap module:: + + -> Device Drivers + -> IOMMU Hardware Support + select S390 AP IOMMU Support + -> VFIO Non-Privileged userspace driver framework + -> Mediated device driver frramework + -> VFIO driver for Mediated devices + -> I/O subsystem + -> VFIO support for AP devices + +2. Secure the AP queues to be used by the three guests so that the host can not + access them. To secure them, there are two sysfs files that specify + bitmasks marking a subset of the APQN range as 'usable by the default AP + queue device drivers' or 'not usable by the default device drivers' and thus + available for use by the vfio_ap device driver'. The location of the sysfs + files containing the masks are:: + + /sys/bus/ap/apmask + /sys/bus/ap/aqmask + + The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs + (APID). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APID from + 0-255. If a bit is set, the APID is marked as usable only by the default AP + queue device drivers; otherwise, the APID is usable by the vfio_ap + device driver. + + The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes + (APQI). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APQI from + 0-255. If a bit is set, the APQI is marked as usable only by the default AP + queue device drivers; otherwise, the APQI is usable by the vfio_ap device + driver. + + Take, for example, the following mask:: + + 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + + It indicates: + + 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6 + belong to the vfio_ap device driver's pool. + + The APQN of each AP queue device assigned to the linux host is checked by the + AP bus against the set of APQNs derived from the cross product of APIDs + and APQIs marked as usable only by the default AP queue device drivers. If a + match is detected, only the default AP queue device drivers will be probed; + otherwise, the vfio_ap device driver will be probed. + + By default, the two masks are set to reserve all APQNs for use by the default + AP queue device drivers. There are two ways the default masks can be changed: + + 1. The sysfs mask files can be edited by echoing a string into the + respective sysfs mask file in one of two formats: + + * An absolute hex string starting with 0x - like "0x12345678" - sets + the mask. If the given string is shorter than the mask, it is padded + with 0s on the right; for example, specifying a mask value of 0x41 is + the same as specifying:: + + 0x4100000000000000000000000000000000000000000000000000000000000000 + + Keep in mind that the mask reads from left to right (i.e., most + significant to least significant bit in big endian order), so the mask + above identifies device numbers 1 and 7 (01000001). + + If the string is longer than the mask, the operation is terminated with + an error (EINVAL). + + * Individual bits in the mask can be switched on and off by specifying + each bit number to be switched in a comma separated list. Each bit + number string must be prepended with a ('+') or minus ('-') to indicate + the corresponding bit is to be switched on ('+') or off ('-'). Some + valid values are: + + - "+0" switches bit 0 on + - "-13" switches bit 13 off + - "+0x41" switches bit 65 on + - "-0xff" switches bit 255 off + + The following example: + + +0,-6,+0x47,-0xf0 + + Switches bits 0 and 71 (0x47) on + + Switches bits 6 and 240 (0xf0) off + + Note that the bits not specified in the list remain as they were before + the operation. + + 2. The masks can also be changed at boot time via parameters on the kernel + command line like this: + + ap.apmask=0xffff ap.aqmask=0x40 + + This would create the following masks:: + + apmask: + 0xffff000000000000000000000000000000000000000000000000000000000000 + + aqmask: + 0x4000000000000000000000000000000000000000000000000000000000000000 + + Resulting in these two pools:: + + default drivers pool: adapter 0-15, domain 1 + alternate drivers pool: adapter 16-255, domains 0, 2-255 + +Securing the APQNs for our example +---------------------------------- + To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, + 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding + APQNs can either be removed from the default masks:: + + echo -5,-6 > /sys/bus/ap/apmask + + echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask + + Or the masks can be set as follows:: + + echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ + > apmask + + echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ + > aqmask + + This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, + 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The + sysfs directory for the vfio_ap device driver will now contain symbolic links + to the AP queue devices bound to it:: + + /sys/bus/ap + ... [drivers] + ...... [vfio_ap] + ......... [05.0004] + ......... [05.0047] + ......... [05.00ab] + ......... [05.00ff] + ......... [06.0004] + ......... [06.0047] + ......... [06.00ab] + ......... [06.00ff] + + Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) + can be bound to the vfio_ap device driver. The reason for this is to + simplify the implementation by not needlessly complicating the design by + supporting older devices that will go out of service in the relatively near + future and for which there are few older systems on which to test. + + The administrator, therefore, must take care to secure only AP queues that + can be bound to the vfio_ap device driver. The device type for a given AP + queue device can be read from the parent card's sysfs directory. For example, + to see the hardware type of the queue 05.0004: + + cat /sys/bus/ap/devices/card05/hwtype + + The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the + vfio_ap device driver. + +3. Create the mediated devices needed to configure the AP matrixes for the + three guests and to provide an interface to the vfio_ap driver for + use by the guests:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) + --------- create + --------- [devices] + + To create the mediated devices for the three guests:: + + uuidgen > create + uuidgen > create + uuidgen > create + + or + + echo $uuid1 > create + echo $uuid2 > create + echo $uuid3 > create + + This will create three mediated devices in the [devices] subdirectory named + after the UUID written to the create attribute file. We call them $uuid1, + $uuid2 and $uuid3 and this is the sysfs directory structure after creation:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + --------------- unassign_control_domain + --------------- unassign_domain + + ------------ [$uuid2] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + + ------------ [$uuid3] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + +4. The administrator now needs to configure the matrixes for the mediated + devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). + + This is how the matrix is configured for Guest1:: + + echo 5 > assign_adapter + echo 6 > assign_adapter + echo 4 > assign_domain + echo 0xab > assign_domain + + Control domains can similarly be assigned using the assign_control_domain + sysfs file. + + If a mistake is made configuring an adapter, domain or control domain, + you can use the unassign_xxx files to unassign the adapter, domain or + control domain. + + To display the matrix configuration for Guest1:: + + cat matrix + + This is how the matrix is configured for Guest2:: + + echo 5 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + This is how the matrix is configured for Guest3:: + + echo 6 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + In order to successfully assign an adapter: + + * The adapter number specified must represent a value from 0 up to the + maximum adapter number configured for the system. If an adapter number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the adapter ID and the IDs of + the previously assigned domains must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APID bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the adapter ID and the IDs of the + previously assigned domains can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a domain: + + * The domain number specified must represent a value from 0 up to the + maximum domain number configured for the system. If a domain number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the domain ID and the IDs of + the previously assigned adapters must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APQI bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the domain ID and the IDs of the + previously assigned adapters can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a control domain, the domain number + specified must represent a value from 0 up to the maximum domain number + configured for the system. If a control domain number higher than the maximum + is specified, the operation will terminate with an error (ENODEV). + +5. Start Guest1:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... + +7. Start Guest2:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... + +7. Start Guest3:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... + +When the guest is shut down, the mediated matrix devices may be removed. + +Using our example again, to remove the mediated matrix device $uuid1:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- remove + +:: + + echo 1 > remove + +This will remove all of the mdev matrix device's sysfs structures including +the mdev device itself. To recreate and reconfigure the mdev matrix device, +all of the steps starting with step 3 will have to be performed again. Note +that the remove will fail if a guest using the mdev is still running. + +It is not necessary to remove an mdev matrix device, but one may want to +remove it if no guest will use it during the remaining lifetime of the linux +host. If the mdev matrix device is removed, one may want to also reconfigure +the pool of adapters and queues reserved for use by the default drivers. + +Limitations +=========== +* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN + to the default drivers pool of a queue that is still assigned to a mediated + device in use by a guest. It is incumbent upon the administrator to + ensure there is no mediated device in use by a guest to which the APQN is + assigned lest the host be given access to the private data of the AP queue + device such as a private key configured specifically for the guest. + +* Dynamically modifying the AP matrix for a running guest (which would amount to + hot(un)plug of AP devices for the guest) is currently not supported + +* Live guest migration is not supported for guests using AP devices. diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt deleted file mode 100644 index 65167cfe4485..000000000000 --- a/Documentation/s390/vfio-ap.txt +++ /dev/null @@ -1,837 +0,0 @@ -Introduction: -============ -The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised -of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. -The AP devices provide cryptographic functions to all CPUs assigned to a -linux system running in an IBM Z system LPAR. - -The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap -is to make AP cards available to KVM guests using the VFIO mediated device -framework. This implementation relies considerably on the s390 virtualization -facilities which do most of the hard work of providing direct access to AP -devices. - -AP Architectural Overview: -========================= -To facilitate the comprehension of the design, let's start with some -definitions: - -* AP adapter - - An AP adapter is an IBM Z adapter card that can perform cryptographic - functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters - assigned to the LPAR in which a linux host is running will be available to - the linux host. Each adapter is identified by a number from 0 to 255; however, - the maximum adapter number is determined by machine model and/or adapter type. - When installed, an AP adapter is accessed by AP instructions executed by any - CPU. - - The AP adapter cards are assigned to a given LPAR via the system's Activation - Profile which can be edited via the HMC. When the linux host system is IPL'd - in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and - creates a sysfs device for each assigned adapter. For example, if AP adapters - 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following - sysfs device entries: - - /sys/devices/ap/card04 - /sys/devices/ap/card0a - - Symbolic links to these devices will also be created in the AP bus devices - sub-directory: - - /sys/bus/ap/devices/[card04] - /sys/bus/ap/devices/[card04] - -* AP domain - - An adapter is partitioned into domains. An adapter can hold up to 256 domains - depending upon the adapter type and hardware configuration. A domain is - identified by a number from 0 to 255; however, the maximum domain number is - determined by machine model and/or adapter type.. A domain can be thought of - as a set of hardware registers and memory used for processing AP commands. A - domain can be configured with a secure private key used for clear key - encryption. A domain is classified in one of two ways depending upon how it - may be accessed: - - * Usage domains are domains that are targeted by an AP instruction to - process an AP command. - - * Control domains are domains that are changed by an AP command sent to a - usage domain; for example, to set the secure private key for the control - domain. - - The AP usage and control domains are assigned to a given LPAR via the system's - Activation Profile which can be edited via the HMC. When a linux host system - is IPL'd in the LPAR, the AP bus module detects the AP usage and control - domains assigned to the LPAR. The domain number of each usage domain and - adapter number of each AP adapter are combined to create AP queue devices - (see AP Queue section below). The domain number of each control domain will be - represented in a bitmask and stored in a sysfs file - /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least - significant bit, correspond to domains 0-255. - -* AP Queue - - An AP queue is the means by which an AP command is sent to a usage domain - inside a specific adapter. An AP queue is identified by a tuple - comprised of an AP adapter ID (APID) and an AP queue index (APQI). The - APQI corresponds to a given usage domain number within the adapter. This tuple - forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP - instructions include a field containing the APQN to identify the AP queue to - which the AP command is to be sent for processing. - - The AP bus will create a sysfs device for each APQN that can be derived from - the cross product of the AP adapter and usage domain numbers detected when the - AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage - domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the - following sysfs entries: - - /sys/devices/ap/card04/04.0006 - /sys/devices/ap/card04/04.0047 - /sys/devices/ap/card0a/0a.0006 - /sys/devices/ap/card0a/0a.0047 - - The following symbolic links to these devices will be created in the AP bus - devices subdirectory: - - /sys/bus/ap/devices/[04.0006] - /sys/bus/ap/devices/[04.0047] - /sys/bus/ap/devices/[0a.0006] - /sys/bus/ap/devices/[0a.0047] - -* AP Instructions: - - There are three AP instructions: - - * NQAP: to enqueue an AP command-request message to a queue - * DQAP: to dequeue an AP command-reply message from a queue - * PQAP: to administer the queues - - AP instructions identify the domain that is targeted to process the AP - command; this must be one of the usage domains. An AP command may modify a - domain that is not one of the usage domains, but the modified domain - must be one of the control domains. - -AP and SIE: -========== -Let's now take a look at how AP instructions executed on a guest are interpreted -by the hardware. - -A satellite control block called the Crypto Control Block (CRYCB) is attached to -our main hardware virtualization control block. The CRYCB contains three fields -to identify the adapters, usage domains and control domains assigned to the KVM -guest: - -* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned - to the KVM guest. Each bit in the mask, from left to right (i.e. from most - significant to least significant bit in big endian order), corresponds to - an APID from 0-255. If a bit is set, the corresponding adapter is valid for - use by the KVM guest. - -* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains - assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from - most significant to least significant bit in big endian order), corresponds to - an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue - is valid for use by the KVM guest. - -* The AP Domain Mask field is a bit mask that identifies the AP control domains - assigned to the KVM guest. The ADM bit mask controls which domains can be - changed by an AP command-request message sent to a usage domain from the - guest. Each bit in the mask, from left to right (i.e. from most significant to - least significant bit in big endian order), corresponds to a domain from - 0-255. If a bit is set, the corresponding domain can be modified by an AP - command-request message sent to a usage domain. - -If you recall from the description of an AP Queue, AP instructions include -an APQN to identify the AP queue to which an AP command-request message is to be -sent (NQAP and PQAP instructions), or from which a command-reply message is to -be received (DQAP instruction). The validity of an APQN is defined by the matrix -calculated from the APM and AQM; it is the cross product of all assigned adapter -numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1 -and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6), -(2,5) and (2,6) will be valid for the guest. - -The APQNs can provide secure key functionality - i.e., a private key is stored -on the adapter card for each of its domains - so each APQN must be assigned to -at most one guest or to the linux host. - - Example 1: Valid configuration: - ------------------------------ - Guest1: adapters 1,2 domains 5,6 - Guest2: adapter 1,2 domain 7 - - This is valid because both guests have a unique set of APQNs: - Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); - Guest2 has APQNs (1,7), (2,7) - - Example 2: Valid configuration: - ------------------------------ - Guest1: adapters 1,2 domains 5,6 - Guest2: adapters 3,4 domains 5,6 - - This is also valid because both guests have a unique set of APQNs: - Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); - Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) - - Example 3: Invalid configuration: - -------------------------------- - Guest1: adapters 1,2 domains 5,6 - Guest2: adapter 1 domains 6,7 - - This is an invalid configuration because both guests have access to - APQN (1,6). - -The Design: -=========== -The design introduces three new objects: - -1. AP matrix device -2. VFIO AP device driver (vfio_ap.ko) -3. VFIO AP mediated matrix pass-through device - -The VFIO AP device driver -------------------------- -The VFIO AP (vfio_ap) device driver serves the following purposes: - -1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. - -2. Sets up the VFIO mediated device interfaces to manage a mediated matrix - device and creates the sysfs interfaces for assigning adapters, usage - domains, and control domains comprising the matrix for a KVM guest. - -3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's - SIE state description to grant the guest access to a matrix of AP devices - -Reserve APQNs for exclusive use of KVM guests ---------------------------------------------- -The following block diagram illustrates the mechanism by which APQNs are -reserved: - - +------------------+ - 7 remove | | - +--------------------> cex4queue driver | - | | | - | +------------------+ - | - | - | +------------------+ +-----------------+ - | 5 register driver | | 3 create | | - | +----------------> Device core +----------> matrix device | - | | | | | | - | | +--------^---------+ +-----------------+ - | | | - | | +-------------------+ - | | +-----------------------------------+ | - | | | 4 register AP driver | | 2 register device - | | | | | -+--------+---+-v---+ +--------+-------+-+ -| | | | -| ap_bus +--------------------- > vfio_ap driver | -| | 8 probe | | -+--------^---------+ +--^--^------------+ -6 edit | | | - apmask | +-----------------------------+ | 9 mdev create - aqmask | | 1 modprobe | -+--------+-----+---+ +----------------+-+ +------------------+ -| | | |8 create | mediated | -| admin | | VFIO device core |---------> matrix | -| + | | | device | -+------+-+---------+ +--------^---------+ +--------^---------+ - | | | | - | | 9 create vfio_ap-passthrough | | - | +------------------------------+ | - +-------------------------------------------------------------+ - 10 assign adapter/domain/control domain - -The process for reserving an AP queue for use by a KVM guest is: - -1. The administrator loads the vfio_ap device driver -2. The vfio-ap driver during its initialization will register a single 'matrix' - device with the device core. This will serve as the parent device for - all mediated matrix devices used to configure an AP matrix for a guest. -3. The /sys/devices/vfio_ap/matrix device is created by the device core -4 The vfio_ap device driver will register with the AP bus for AP queue devices - of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap - driver's probe and remove callback interfaces. Devices older than CEX4 queues - are not supported to simplify the implementation by not needlessly - complicating the design by supporting older devices that will go out of - service in the relatively near future, and for which there are few older - systems around on which to test. -5. The AP bus registers the vfio_ap device driver with the device core -6. The administrator edits the AP adapter and queue masks to reserve AP queues - for use by the vfio_ap device driver. -7. The AP bus removes the AP queues reserved for the vfio_ap driver from the - default zcrypt cex4queue driver. -8. The AP bus probes the vfio_ap device driver to bind the queues reserved for - it. -9. The administrator creates a passthrough type mediated matrix device to be - used by a guest -10 The administrator assigns the adapters, usage domains and control domains - to be exclusively used by a guest. - -Set up the VFIO mediated device interfaces ------------------------------------------- -The VFIO AP device driver utilizes the common interface of the VFIO mediated -device core driver to: -* Register an AP mediated bus driver to add a mediated matrix device to and - remove it from a VFIO group. -* Create and destroy a mediated matrix device -* Add a mediated matrix device to and remove it from the AP mediated bus driver -* Add a mediated matrix device to and remove it from an IOMMU group - -The following high-level block diagram shows the main components and interfaces -of the VFIO AP mediated matrix device driver: - - +-------------+ - | | - | +---------+ | mdev_register_driver() +--------------+ - | | Mdev | +<-----------------------+ | - | | bus | | | vfio_mdev.ko | - | | driver | +----------------------->+ |<-> VFIO user - | +---------+ | probe()/remove() +--------------+ APIs - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ - | |Physical | +<-----------------------+ | - | | device | | | vfio_ap.ko |<-> matrix - | |interface| +----------------------->+ | device - | +---------+ | callback +--------------+ - +-------------+ - -During initialization of the vfio_ap module, the matrix device is registered -with an 'mdev_parent_ops' structure that provides the sysfs attribute -structures, mdev functions and callback interfaces for managing the mediated -matrix device. - -* sysfs attribute structures: - * supported_type_groups - The VFIO mediated device framework supports creation of user-defined - mediated device types. These mediated device types are specified - via the 'supported_type_groups' structure when a device is registered - with the mediated device framework. The registration process creates the - sysfs structures for each mediated device type specified in the - 'mdev_supported_types' sub-directory of the device being registered. Along - with the device type, the sysfs attributes of the mediated device type are - provided. - - The VFIO AP device driver will register one mediated device type for - passthrough devices: - /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough - Only the read-only attributes required by the VFIO mdev framework will - be provided: - ... name - ... device_api - ... available_instances - ... device_api - Where: - * name: specifies the name of the mediated device type - * device_api: the mediated device type's API - * available_instances: the number of mediated matrix passthrough devices - that can be created - * device_api: specifies the VFIO API - * mdev_attr_groups - This attribute group identifies the user-defined sysfs attributes of the - mediated device. When a device is registered with the VFIO mediated device - framework, the sysfs attribute files identified in the 'mdev_attr_groups' - structure will be created in the mediated matrix device's directory. The - sysfs attributes for a mediated matrix device are: - * assign_adapter: - * unassign_adapter: - Write-only attributes for assigning/unassigning an AP adapter to/from the - mediated matrix device. To assign/unassign an adapter, the APID of the - adapter is echoed to the respective attribute file. - * assign_domain: - * unassign_domain: - Write-only attributes for assigning/unassigning an AP usage domain to/from - the mediated matrix device. To assign/unassign a domain, the domain - number of the the usage domain is echoed to the respective attribute - file. - * matrix: - A read-only file for displaying the APQNs derived from the cross product - of the adapter and domain numbers assigned to the mediated matrix device. - * assign_control_domain: - * unassign_control_domain: - Write-only attributes for assigning/unassigning an AP control domain - to/from the mediated matrix device. To assign/unassign a control domain, - the ID of the domain to be assigned/unassigned is echoed to the respective - attribute file. - * control_domains: - A read-only file for displaying the control domain numbers assigned to the - mediated matrix device. - -* functions: - * create: - allocates the ap_matrix_mdev structure used by the vfio_ap driver to: - * Store the reference to the KVM structure for the guest using the mdev - * Store the AP matrix configuration for the adapters, domains, and control - domains assigned via the corresponding sysfs attributes files - * remove: - deallocates the mediated matrix device's ap_matrix_mdev structure. This will - be allowed only if a running guest is not using the mdev. - -* callback interfaces - * open: - The vfio_ap driver uses this callback to register a - VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix - device. The open is invoked when QEMU connects the VFIO iommu group - for the mdev matrix device to the MDEV bus. Access to the KVM structure used - to configure the KVM guest is provided via this callback. The KVM structure, - is used to configure the guest's access to the AP matrix defined via the - mediated matrix device's sysfs attribute files. - * release: - unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the - mdev matrix device and deconfigures the guest's AP matrix. - -Configure the APM, AQM and ADM in the CRYCB: -------------------------------------------- -Configuring the AP matrix for a KVM guest will be performed when the -VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier -function is called when QEMU connects to KVM. The guest's AP matrix is -configured via it's CRYCB by: -* Setting the bits in the APM corresponding to the APIDs assigned to the - mediated matrix device via its 'assign_adapter' interface. -* Setting the bits in the AQM corresponding to the domains assigned to the - mediated matrix device via its 'assign_domain' interface. -* Setting the bits in the ADM corresponding to the domain dIDs assigned to the - mediated matrix device via its 'assign_control_domains' interface. - -The CPU model features for AP ------------------------------ -The AP stack relies on the presence of the AP instructions as well as two -facilities: The AP Facilities Test (APFT) facility; and the AP Query -Configuration Information (QCI) facility. These features/facilities are made -available to a KVM guest via the following CPU model features: - -1. ap: Indicates whether the AP instructions are installed on the guest. This - feature will be enabled by KVM only if the AP instructions are installed - on the host. - -2. apft: Indicates the APFT facility is available on the guest. This facility - can be made available to the guest only if it is available on the host (i.e., - facility bit 15 is set). - -3. apqci: Indicates the AP QCI facility is available on the guest. This facility - can be made available to the guest only if it is available on the host (i.e., - facility bit 12 is set). - -Note: If the user chooses to specify a CPU model different than the 'host' -model to QEMU, the CPU model features and facilities need to be turned on -explicitly; for example: - - /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on - -A guest can be precluded from using AP features/facilities by turning them off -explicitly; for example: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off - -Note: If the APFT facility is turned off (apft=off) for the guest, the guest -will not see any AP devices. The zcrypt device drivers that register for type 10 -and newer AP devices - i.e., the cex4card and cex4queue device drivers - need -the APFT facility to ascertain the facilities installed on a given AP device. If -the APFT facility is not installed on the guest, then the probe of device -drivers will fail since only type 10 and newer devices can be configured for -guest use. - -Example: -======= -Let's now provide an example to illustrate how KVM guests may be given -access to AP facilities. For this example, we will show how to configure -three guests such that executing the lszcrypt command on the guests would -look like this: - -Guest1 ------- -CARD.DOMAIN TYPE MODE ------------------------------- -05 CEX5C CCA-Coproc -05.0004 CEX5C CCA-Coproc -05.00ab CEX5C CCA-Coproc -06 CEX5A Accelerator -06.0004 CEX5A Accelerator -06.00ab CEX5C CCA-Coproc - -Guest2 ------- -CARD.DOMAIN TYPE MODE ------------------------------- -05 CEX5A Accelerator -05.0047 CEX5A Accelerator -05.00ff CEX5A Accelerator - -Guest2 ------- -CARD.DOMAIN TYPE MODE ------------------------------- -06 CEX5A Accelerator -06.0047 CEX5A Accelerator -06.00ff CEX5A Accelerator - -These are the steps: - -1. Install the vfio_ap module on the linux host. The dependency chain for the - vfio_ap module is: - * iommu - * s390 - * zcrypt - * vfio - * vfio_mdev - * vfio_mdev_device - * KVM - - To build the vfio_ap module, the kernel build must be configured with the - following Kconfig elements selected: - * IOMMU_SUPPORT - * S390 - * ZCRYPT - * S390_AP_IOMMU - * VFIO - * VFIO_MDEV - * VFIO_MDEV_DEVICE - * KVM - - If using make menuconfig select the following to build the vfio_ap module: - -> Device Drivers - -> IOMMU Hardware Support - select S390 AP IOMMU Support - -> VFIO Non-Privileged userspace driver framework - -> Mediated device driver frramework - -> VFIO driver for Mediated devices - -> I/O subsystem - -> VFIO support for AP devices - -2. Secure the AP queues to be used by the three guests so that the host can not - access them. To secure them, there are two sysfs files that specify - bitmasks marking a subset of the APQN range as 'usable by the default AP - queue device drivers' or 'not usable by the default device drivers' and thus - available for use by the vfio_ap device driver'. The location of the sysfs - files containing the masks are: - - /sys/bus/ap/apmask - /sys/bus/ap/aqmask - - The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs - (APID). Each bit in the mask, from left to right (i.e., from most significant - to least significant bit in big endian order), corresponds to an APID from - 0-255. If a bit is set, the APID is marked as usable only by the default AP - queue device drivers; otherwise, the APID is usable by the vfio_ap - device driver. - - The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes - (APQI). Each bit in the mask, from left to right (i.e., from most significant - to least significant bit in big endian order), corresponds to an APQI from - 0-255. If a bit is set, the APQI is marked as usable only by the default AP - queue device drivers; otherwise, the APQI is usable by the vfio_ap device - driver. - - Take, for example, the following mask: - - 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff - - It indicates: - - 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6 - belong to the vfio_ap device driver's pool. - - The APQN of each AP queue device assigned to the linux host is checked by the - AP bus against the set of APQNs derived from the cross product of APIDs - and APQIs marked as usable only by the default AP queue device drivers. If a - match is detected, only the default AP queue device drivers will be probed; - otherwise, the vfio_ap device driver will be probed. - - By default, the two masks are set to reserve all APQNs for use by the default - AP queue device drivers. There are two ways the default masks can be changed: - - 1. The sysfs mask files can be edited by echoing a string into the - respective sysfs mask file in one of two formats: - - * An absolute hex string starting with 0x - like "0x12345678" - sets - the mask. If the given string is shorter than the mask, it is padded - with 0s on the right; for example, specifying a mask value of 0x41 is - the same as specifying: - - 0x4100000000000000000000000000000000000000000000000000000000000000 - - Keep in mind that the mask reads from left to right (i.e., most - significant to least significant bit in big endian order), so the mask - above identifies device numbers 1 and 7 (01000001). - - If the string is longer than the mask, the operation is terminated with - an error (EINVAL). - - * Individual bits in the mask can be switched on and off by specifying - each bit number to be switched in a comma separated list. Each bit - number string must be prepended with a ('+') or minus ('-') to indicate - the corresponding bit is to be switched on ('+') or off ('-'). Some - valid values are: - - "+0" switches bit 0 on - "-13" switches bit 13 off - "+0x41" switches bit 65 on - "-0xff" switches bit 255 off - - The following example: - +0,-6,+0x47,-0xf0 - - Switches bits 0 and 71 (0x47) on - Switches bits 6 and 240 (0xf0) off - - Note that the bits not specified in the list remain as they were before - the operation. - - 2. The masks can also be changed at boot time via parameters on the kernel - command line like this: - - ap.apmask=0xffff ap.aqmask=0x40 - - This would create the following masks: - - apmask: - 0xffff000000000000000000000000000000000000000000000000000000000000 - - aqmask: - 0x4000000000000000000000000000000000000000000000000000000000000000 - - Resulting in these two pools: - - default drivers pool: adapter 0-15, domain 1 - alternate drivers pool: adapter 16-255, domains 0, 2-255 - - Securing the APQNs for our example: - ---------------------------------- - To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, - 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding - APQNs can either be removed from the default masks: - - echo -5,-6 > /sys/bus/ap/apmask - - echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask - - Or the masks can be set as follows: - - echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ - > apmask - - echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ - > aqmask - - This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, - 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The - sysfs directory for the vfio_ap device driver will now contain symbolic links - to the AP queue devices bound to it: - - /sys/bus/ap - ... [drivers] - ...... [vfio_ap] - ......... [05.0004] - ......... [05.0047] - ......... [05.00ab] - ......... [05.00ff] - ......... [06.0004] - ......... [06.0047] - ......... [06.00ab] - ......... [06.00ff] - - Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) - can be bound to the vfio_ap device driver. The reason for this is to - simplify the implementation by not needlessly complicating the design by - supporting older devices that will go out of service in the relatively near - future and for which there are few older systems on which to test. - - The administrator, therefore, must take care to secure only AP queues that - can be bound to the vfio_ap device driver. The device type for a given AP - queue device can be read from the parent card's sysfs directory. For example, - to see the hardware type of the queue 05.0004: - - cat /sys/bus/ap/devices/card05/hwtype - - The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the - vfio_ap device driver. - -3. Create the mediated devices needed to configure the AP matrixes for the - three guests and to provide an interface to the vfio_ap driver for - use by the guests: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) - --------- create - --------- [devices] - - To create the mediated devices for the three guests: - - uuidgen > create - uuidgen > create - uuidgen > create - - or - - echo $uuid1 > create - echo $uuid2 > create - echo $uuid3 > create - - This will create three mediated devices in the [devices] subdirectory named - after the UUID written to the create attribute file. We call them $uuid1, - $uuid2 and $uuid3 and this is the sysfs directory structure after creation: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] - --------- [devices] - ------------ [$uuid1] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - --------------- unassign_control_domain - --------------- unassign_domain - - ------------ [$uuid2] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain - - ------------ [$uuid3] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain - -4. The administrator now needs to configure the matrixes for the mediated - devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). - - This is how the matrix is configured for Guest1: - - echo 5 > assign_adapter - echo 6 > assign_adapter - echo 4 > assign_domain - echo 0xab > assign_domain - - Control domains can similarly be assigned using the assign_control_domain - sysfs file. - - If a mistake is made configuring an adapter, domain or control domain, - you can use the unassign_xxx files to unassign the adapter, domain or - control domain. - - To display the matrix configuration for Guest1: - - cat matrix - - This is how the matrix is configured for Guest2: - - echo 5 > assign_adapter - echo 0x47 > assign_domain - echo 0xff > assign_domain - - This is how the matrix is configured for Guest3: - - echo 6 > assign_adapter - echo 0x47 > assign_domain - echo 0xff > assign_domain - - In order to successfully assign an adapter: - - * The adapter number specified must represent a value from 0 up to the - maximum adapter number configured for the system. If an adapter number - higher than the maximum is specified, the operation will terminate with - an error (ENODEV). - - * All APQNs that can be derived from the adapter ID and the IDs of - the previously assigned domains must be bound to the vfio_ap device - driver. If no domains have yet been assigned, then there must be at least - one APQN with the specified APID bound to the vfio_ap driver. If no such - APQNs are bound to the driver, the operation will terminate with an - error (EADDRNOTAVAIL). - - No APQN that can be derived from the adapter ID and the IDs of the - previously assigned domains can be assigned to another mediated matrix - device. If an APQN is assigned to another mediated matrix device, the - operation will terminate with an error (EADDRINUSE). - - In order to successfully assign a domain: - - * The domain number specified must represent a value from 0 up to the - maximum domain number configured for the system. If a domain number - higher than the maximum is specified, the operation will terminate with - an error (ENODEV). - - * All APQNs that can be derived from the domain ID and the IDs of - the previously assigned adapters must be bound to the vfio_ap device - driver. If no domains have yet been assigned, then there must be at least - one APQN with the specified APQI bound to the vfio_ap driver. If no such - APQNs are bound to the driver, the operation will terminate with an - error (EADDRNOTAVAIL). - - No APQN that can be derived from the domain ID and the IDs of the - previously assigned adapters can be assigned to another mediated matrix - device. If an APQN is assigned to another mediated matrix device, the - operation will terminate with an error (EADDRINUSE). - - In order to successfully assign a control domain, the domain number - specified must represent a value from 0 up to the maximum domain number - configured for the system. If a control domain number higher than the maximum - is specified, the operation will terminate with an error (ENODEV). - -5. Start Guest1: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... - -7. Start Guest2: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... - -7. Start Guest3: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... - -When the guest is shut down, the mediated matrix devices may be removed. - -Using our example again, to remove the mediated matrix device $uuid1: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] - --------- [devices] - ------------ [$uuid1] - --------------- remove - - - echo 1 > remove - - This will remove all of the mdev matrix device's sysfs structures including - the mdev device itself. To recreate and reconfigure the mdev matrix device, - all of the steps starting with step 3 will have to be performed again. Note - that the remove will fail if a guest using the mdev is still running. - - It is not necessary to remove an mdev matrix device, but one may want to - remove it if no guest will use it during the remaining lifetime of the linux - host. If the mdev matrix device is removed, one may want to also reconfigure - the pool of adapters and queues reserved for use by the default drivers. - -Limitations -=========== -* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN - to the default drivers pool of a queue that is still assigned to a mediated - device in use by a guest. It is incumbent upon the administrator to - ensure there is no mediated device in use by a guest to which the APQN is - assigned lest the host be given access to the private data of the AP queue - device such as a private key configured specifically for the guest. - -* Dynamically modifying the AP matrix for a running guest (which would amount to - hot(un)plug of AP devices for the guest) is currently not supported - -* Live guest migration is not supported for guests using AP devices. diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst new file mode 100644 index 000000000000..1f6d0b56d53e --- /dev/null +++ b/Documentation/s390/vfio-ccw.rst @@ -0,0 +1,326 @@ +================================== +vfio-ccw: the basic infrastructure +================================== + +Introduction +------------ + +Here we describe the vfio support for I/O subchannel devices for +Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a +virtual machine, while vfio is the means. + +Different than other hardware architectures, s390 has defined a unified +I/O access method, which is so called Channel I/O. It has its own access +patterns: + +- Channel programs run asynchronously on a separate (co)processor. +- The channel subsystem will access any memory designated by the caller + in the channel program directly, i.e. there is no iommu involved. + +Thus when we introduce vfio support for these devices, we realize it +with a mediated device (mdev) implementation. The vfio mdev will be +added to an iommu group, so as to make itself able to be managed by the +vfio framework. And we add read/write callbacks for special vfio I/O +regions to pass the channel programs from the mdev to its parent device +(the real I/O subchannel device) to do further address translation and +to perform I/O instructions. + +This document does not intend to explain the s390 I/O architecture in +every detail. More information/reference could be found here: + +- A good start to know Channel I/O in general: + https://en.wikipedia.org/wiki/Channel_I/O +- s390 architecture: + s390 Principles of Operation manual (IBM Form. No. SA22-7832) +- The existing QEMU code which implements a simple emulated channel + subsystem could also be a good reference. It makes it easier to follow + the flow. + qemu/hw/s390x/css.c + +For vfio mediated device framework: +- Documentation/vfio-mediated-device.txt + +Motivation of vfio-ccw +---------------------- + +Typically, a guest virtualized via QEMU/KVM on s390 only sees +paravirtualized virtio devices via the "Virtio Over Channel I/O +(virtio-ccw)" transport. This makes virtio devices discoverable via +standard operating system algorithms for handling channel devices. + +However this is not enough. On s390 for the majority of devices, which +use the standard Channel I/O based mechanism, we also need to provide +the functionality of passing through them to a QEMU virtual machine. +This includes devices that don't have a virtio counterpart (e.g. tape +drives) or that have specific characteristics which guests want to +exploit. + +For passing a device to a guest, we want to use the same interface as +everybody else, namely vfio. We implement this vfio support for channel +devices via the vfio mediated device framework and the subchannel device +driver "vfio_ccw". + +Access patterns of CCW devices +------------------------------ + +s390 architecture has implemented a so called channel subsystem, that +provides a unified view of the devices physically attached to the +systems. Though the s390 hardware platform knows about a huge variety of +different peripheral attachments like disk devices (aka. DASDs), tapes, +communication controllers, etc. They can all be accessed by a well +defined access method and they are presenting I/O completion a unified +way: I/O interruptions. + +All I/O requires the use of channel command words (CCWs). A CCW is an +instruction to a specialized I/O channel processor. A channel program is +a sequence of CCWs which are executed by the I/O channel subsystem. To +issue a channel program to the channel subsystem, it is required to +build an operation request block (ORB), which can be used to point out +the format of the CCW and other control information to the system. The +operating system signals the I/O channel subsystem to begin executing +the channel program with a SSCH (start sub-channel) instruction. The +central processor is then free to proceed with non-I/O instructions +until interrupted. The I/O completion result is received by the +interrupt handler in the form of interrupt response block (IRB). + +Back to vfio-ccw, in short: + +- ORBs and channel programs are built in guest kernel (with guest + physical addresses). +- ORBs and channel programs are passed to the host kernel. +- Host kernel translates the guest physical addresses to real addresses + and starts the I/O with issuing a privileged Channel I/O instruction + (e.g SSCH). +- channel programs run asynchronously on a separate processor. +- I/O completion will be signaled to the host with I/O interruptions. + And it will be copied as IRB to user space to pass it back to the + guest. + +Physical vfio ccw device and its child mdev +------------------------------------------- + +As mentioned above, we realize vfio-ccw with a mdev implementation. + +Channel I/O does not have IOMMU hardware support, so the physical +vfio-ccw device does not have an IOMMU level translation or isolation. + +Subchannel I/O instructions are all privileged instructions. When +handling the I/O instruction interception, vfio-ccw has the software +policing and translation how the channel program is programmed before +it gets sent to hardware. + +Within this implementation, we have two drivers for two types of +devices: + +- The vfio_ccw driver for the physical subchannel device. + This is an I/O subchannel driver for the real subchannel device. It + realizes a group of callbacks and registers to the mdev framework as a + parent (physical) device. As a consequence, mdev provides vfio_ccw a + generic interface (sysfs) to create mdev devices. A vfio mdev could be + created by vfio_ccw then and added to the mediated bus. It is the vfio + device that added to an IOMMU group and a vfio group. + vfio_ccw also provides an I/O region to accept channel program + request from user space and store I/O interrupt result for user + space to retrieve. To notify user space an I/O completion, it offers + an interface to setup an eventfd fd for asynchronous signaling. + +- The vfio_mdev driver for the mediated vfio ccw device. + This is provided by the mdev framework. It is a vfio device driver for + the mdev that created by vfio_ccw. + It realizes a group of vfio device driver callbacks, adds itself to a + vfio group, and registers itself to the mdev framework as a mdev + driver. + It uses a vfio iommu backend that uses the existing map and unmap + ioctls, but rather than programming them into an IOMMU for a device, + it simply stores the translations for use by later requests. This + means that a device programmed in a VM with guest physical addresses + can have the vfio kernel convert that address to process virtual + address, pin the page and program the hardware with the host physical + address in one step. + For a mdev, the vfio iommu backend will not pin the pages during the + VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database + of the iova<->vaddr mappings in this operation. And they export a + vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu + backend for the physical devices to pin and unpin pages by demand. + +Below is a high Level block diagram:: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ccw.ko |<-> subchannel + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +The process of how these work together. + +1. vfio_ccw.ko drives the physical I/O subchannel, and registers the + physical device (with callbacks) to mdev framework. + When vfio_ccw probing the subchannel device, it registers device + pointer and callbacks to the mdev framework. Mdev related file nodes + under the device node in sysfs would be created for the subchannel + device, namely 'mdev_create', 'mdev_destroy' and + 'mdev_supported_types'. +2. Create a mediated vfio ccw device. + Use the 'mdev_create' sysfs file, we need to manually create one (and + only one for our case) mediated device. +3. vfio_mdev.ko drives the mediated ccw device. + vfio_mdev is also the vfio device drvier. It will probe the mdev and + add it to an iommu_group and a vfio_group. Then we could pass through + the mdev to a guest. + +vfio-ccw I/O region +------------------- + +An I/O region is used to accept channel program request from user +space and store I/O interrupt result for user space to retrieve. The +definition of the region is:: + + struct ccw_io_region { + #define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; + #define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; + #define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; + } __packed; + +While starting an I/O request, orb_area should be filled with the +guest ORB, and scsw_area should be filled with the SCSW of the Virtual +Subchannel. + +irb_area stores the I/O result. + +ret_code stores a return code for each access of the region. + +vfio-ccw operation details +-------------------------- + +vfio-ccw follows what vfio-pci did on the s390 platform and uses +vfio-iommu-type1 as the vfio iommu backend. + +* CCW translation APIs + A group of APIs (start with `cp_`) to do CCW translation. The CCWs + passed in by a user space program are organized with their guest + physical memory addresses. These APIs will copy the CCWs into kernel + space, and assemble a runnable kernel channel program by updating the + guest physical addresses with their corresponding host physical addresses. + Note that we have to use IDALs even for direct-access CCWs, as the + referenced memory can be located anywhere, including above 2G. + +* vfio_ccw device driver + This driver utilizes the CCW translation APIs and introduces + vfio_ccw, which is the driver for the I/O subchannel devices you want + to pass through. + vfio_ccw implements the following vfio ioctls:: + + VFIO_DEVICE_GET_INFO + VFIO_DEVICE_GET_IRQ_INFO + VFIO_DEVICE_GET_REGION_INFO + VFIO_DEVICE_RESET + VFIO_DEVICE_SET_IRQS + + This provides an I/O region, so that the user space program can pass a + channel program to the kernel, to do further CCW translation before + issuing them to a real device. + This also provides the SET_IRQ ioctl to setup an event notifier to + notify the user space program the I/O completion in an asynchronous + way. + +The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a +good example to get understand how these patches work. Here is a little +bit more detail how an I/O request triggered by the QEMU guest will be +handled (without error handling). + +Explanation: + +- Q1-Q7: QEMU side process. +- K1-K5: Kernel side process. + +Q1. + Get I/O region info during initialization. + +Q2. + Setup event notifier and handler to handle I/O completion. + +... ... + +Q3. + Intercept a ssch instruction. +Q4. + Write the guest channel program and ORB to the I/O region. + + K1. + Copy from guest to kernel. + K2. + Translate the guest channel program to a host kernel space + channel program, which becomes runnable for a real device. + K3. + With the necessary information contained in the orb passed in + by QEMU, issue the ccwchain to the device. + K4. + Return the ssch CC code. +Q5. + Return the CC code to the guest. + +... ... + + K5. + Interrupt handler gets the I/O result and write the result to + the I/O region. + K6. + Signal QEMU to retrieve the result. + +Q6. + Get the signal and event handler reads out the result from the I/O + region. +Q7. + Update the irb for the guest. + +Limitations +----------- + +The current vfio-ccw implementation focuses on supporting basic commands +needed to implement block device functionality (read/write) of DASD/ECKD +device only. Some commands may need special handling in the future, for +example, anything related to path grouping. + +DASD is a kind of storage device. While ECKD is a data recording format. +More information for DASD and ECKD could be found here: +https://en.wikipedia.org/wiki/Direct-access_storage_device +https://en.wikipedia.org/wiki/Count_key_data + +Together with the corresponding work in QEMU, we can bring the passed +through DASD/ECKD device online in a guest now and use it as a block +device. + +While the current code allows the guest to start channel programs via +START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is +not yet implemented. + +vfio-ccw supports classic (command mode) channel I/O only. Transport +mode (HPF) is not supported. + +QDIO subchannels are currently not supported. Classic devices other than +DASD/ECKD might work, but have not been tested. + +Reference +--------- +1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) +2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) +3. https://en.wikipedia.org/wiki/Channel_I/O +4. Documentation/s390/cds.rst +5. Documentation/vfio.txt +6. Documentation/vfio-mediated-device.txt diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt deleted file mode 100644 index 2be11ad864ff..000000000000 --- a/Documentation/s390/vfio-ccw.txt +++ /dev/null @@ -1,300 +0,0 @@ -vfio-ccw: the basic infrastructure -================================== - -Introduction ------------- - -Here we describe the vfio support for I/O subchannel devices for -Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a -virtual machine, while vfio is the means. - -Different than other hardware architectures, s390 has defined a unified -I/O access method, which is so called Channel I/O. It has its own access -patterns: -- Channel programs run asynchronously on a separate (co)processor. -- The channel subsystem will access any memory designated by the caller - in the channel program directly, i.e. there is no iommu involved. -Thus when we introduce vfio support for these devices, we realize it -with a mediated device (mdev) implementation. The vfio mdev will be -added to an iommu group, so as to make itself able to be managed by the -vfio framework. And we add read/write callbacks for special vfio I/O -regions to pass the channel programs from the mdev to its parent device -(the real I/O subchannel device) to do further address translation and -to perform I/O instructions. - -This document does not intend to explain the s390 I/O architecture in -every detail. More information/reference could be found here: -- A good start to know Channel I/O in general: - https://en.wikipedia.org/wiki/Channel_I/O -- s390 architecture: - s390 Principles of Operation manual (IBM Form. No. SA22-7832) -- The existing QEMU code which implements a simple emulated channel - subsystem could also be a good reference. It makes it easier to follow - the flow. - qemu/hw/s390x/css.c - -For vfio mediated device framework: -- Documentation/vfio-mediated-device.txt - -Motivation of vfio-ccw ----------------------- - -Typically, a guest virtualized via QEMU/KVM on s390 only sees -paravirtualized virtio devices via the "Virtio Over Channel I/O -(virtio-ccw)" transport. This makes virtio devices discoverable via -standard operating system algorithms for handling channel devices. - -However this is not enough. On s390 for the majority of devices, which -use the standard Channel I/O based mechanism, we also need to provide -the functionality of passing through them to a QEMU virtual machine. -This includes devices that don't have a virtio counterpart (e.g. tape -drives) or that have specific characteristics which guests want to -exploit. - -For passing a device to a guest, we want to use the same interface as -everybody else, namely vfio. We implement this vfio support for channel -devices via the vfio mediated device framework and the subchannel device -driver "vfio_ccw". - -Access patterns of CCW devices ------------------------------- - -s390 architecture has implemented a so called channel subsystem, that -provides a unified view of the devices physically attached to the -systems. Though the s390 hardware platform knows about a huge variety of -different peripheral attachments like disk devices (aka. DASDs), tapes, -communication controllers, etc. They can all be accessed by a well -defined access method and they are presenting I/O completion a unified -way: I/O interruptions. - -All I/O requires the use of channel command words (CCWs). A CCW is an -instruction to a specialized I/O channel processor. A channel program is -a sequence of CCWs which are executed by the I/O channel subsystem. To -issue a channel program to the channel subsystem, it is required to -build an operation request block (ORB), which can be used to point out -the format of the CCW and other control information to the system. The -operating system signals the I/O channel subsystem to begin executing -the channel program with a SSCH (start sub-channel) instruction. The -central processor is then free to proceed with non-I/O instructions -until interrupted. The I/O completion result is received by the -interrupt handler in the form of interrupt response block (IRB). - -Back to vfio-ccw, in short: -- ORBs and channel programs are built in guest kernel (with guest - physical addresses). -- ORBs and channel programs are passed to the host kernel. -- Host kernel translates the guest physical addresses to real addresses - and starts the I/O with issuing a privileged Channel I/O instruction - (e.g SSCH). -- channel programs run asynchronously on a separate processor. -- I/O completion will be signaled to the host with I/O interruptions. - And it will be copied as IRB to user space to pass it back to the - guest. - -Physical vfio ccw device and its child mdev -------------------------------------------- - -As mentioned above, we realize vfio-ccw with a mdev implementation. - -Channel I/O does not have IOMMU hardware support, so the physical -vfio-ccw device does not have an IOMMU level translation or isolation. - -Subchannel I/O instructions are all privileged instructions. When -handling the I/O instruction interception, vfio-ccw has the software -policing and translation how the channel program is programmed before -it gets sent to hardware. - -Within this implementation, we have two drivers for two types of -devices: -- The vfio_ccw driver for the physical subchannel device. - This is an I/O subchannel driver for the real subchannel device. It - realizes a group of callbacks and registers to the mdev framework as a - parent (physical) device. As a consequence, mdev provides vfio_ccw a - generic interface (sysfs) to create mdev devices. A vfio mdev could be - created by vfio_ccw then and added to the mediated bus. It is the vfio - device that added to an IOMMU group and a vfio group. - vfio_ccw also provides an I/O region to accept channel program - request from user space and store I/O interrupt result for user - space to retrieve. To notify user space an I/O completion, it offers - an interface to setup an eventfd fd for asynchronous signaling. - -- The vfio_mdev driver for the mediated vfio ccw device. - This is provided by the mdev framework. It is a vfio device driver for - the mdev that created by vfio_ccw. - It realizes a group of vfio device driver callbacks, adds itself to a - vfio group, and registers itself to the mdev framework as a mdev - driver. - It uses a vfio iommu backend that uses the existing map and unmap - ioctls, but rather than programming them into an IOMMU for a device, - it simply stores the translations for use by later requests. This - means that a device programmed in a VM with guest physical addresses - can have the vfio kernel convert that address to process virtual - address, pin the page and program the hardware with the host physical - address in one step. - For a mdev, the vfio iommu backend will not pin the pages during the - VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database - of the iova<->vaddr mappings in this operation. And they export a - vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu - backend for the physical devices to pin and unpin pages by demand. - -Below is a high Level block diagram. - - +-------------+ - | | - | +---------+ | mdev_register_driver() +--------------+ - | | Mdev | +<-----------------------+ | - | | bus | | | vfio_mdev.ko | - | | driver | +----------------------->+ |<-> VFIO user - | +---------+ | probe()/remove() +--------------+ APIs - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ - | |Physical | +<-----------------------+ | - | | device | | | vfio_ccw.ko |<-> subchannel - | |interface| +----------------------->+ | device - | +---------+ | callback +--------------+ - +-------------+ - -The process of how these work together. -1. vfio_ccw.ko drives the physical I/O subchannel, and registers the - physical device (with callbacks) to mdev framework. - When vfio_ccw probing the subchannel device, it registers device - pointer and callbacks to the mdev framework. Mdev related file nodes - under the device node in sysfs would be created for the subchannel - device, namely 'mdev_create', 'mdev_destroy' and - 'mdev_supported_types'. -2. Create a mediated vfio ccw device. - Use the 'mdev_create' sysfs file, we need to manually create one (and - only one for our case) mediated device. -3. vfio_mdev.ko drives the mediated ccw device. - vfio_mdev is also the vfio device drvier. It will probe the mdev and - add it to an iommu_group and a vfio_group. Then we could pass through - the mdev to a guest. - -vfio-ccw I/O region -------------------- - -An I/O region is used to accept channel program request from user -space and store I/O interrupt result for user space to retrieve. The -definition of the region is: - -struct ccw_io_region { -#define ORB_AREA_SIZE 12 - __u8 orb_area[ORB_AREA_SIZE]; -#define SCSW_AREA_SIZE 12 - __u8 scsw_area[SCSW_AREA_SIZE]; -#define IRB_AREA_SIZE 96 - __u8 irb_area[IRB_AREA_SIZE]; - __u32 ret_code; -} __packed; - -While starting an I/O request, orb_area should be filled with the -guest ORB, and scsw_area should be filled with the SCSW of the Virtual -Subchannel. - -irb_area stores the I/O result. - -ret_code stores a return code for each access of the region. - -vfio-ccw operation details --------------------------- - -vfio-ccw follows what vfio-pci did on the s390 platform and uses -vfio-iommu-type1 as the vfio iommu backend. - -* CCW translation APIs - A group of APIs (start with 'cp_') to do CCW translation. The CCWs - passed in by a user space program are organized with their guest - physical memory addresses. These APIs will copy the CCWs into kernel - space, and assemble a runnable kernel channel program by updating the - guest physical addresses with their corresponding host physical addresses. - Note that we have to use IDALs even for direct-access CCWs, as the - referenced memory can be located anywhere, including above 2G. - -* vfio_ccw device driver - This driver utilizes the CCW translation APIs and introduces - vfio_ccw, which is the driver for the I/O subchannel devices you want - to pass through. - vfio_ccw implements the following vfio ioctls: - VFIO_DEVICE_GET_INFO - VFIO_DEVICE_GET_IRQ_INFO - VFIO_DEVICE_GET_REGION_INFO - VFIO_DEVICE_RESET - VFIO_DEVICE_SET_IRQS - This provides an I/O region, so that the user space program can pass a - channel program to the kernel, to do further CCW translation before - issuing them to a real device. - This also provides the SET_IRQ ioctl to setup an event notifier to - notify the user space program the I/O completion in an asynchronous - way. - -The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a -good example to get understand how these patches work. Here is a little -bit more detail how an I/O request triggered by the QEMU guest will be -handled (without error handling). - -Explanation: -Q1-Q7: QEMU side process. -K1-K5: Kernel side process. - -Q1. Get I/O region info during initialization. -Q2. Setup event notifier and handler to handle I/O completion. - -... ... - -Q3. Intercept a ssch instruction. -Q4. Write the guest channel program and ORB to the I/O region. - K1. Copy from guest to kernel. - K2. Translate the guest channel program to a host kernel space - channel program, which becomes runnable for a real device. - K3. With the necessary information contained in the orb passed in - by QEMU, issue the ccwchain to the device. - K4. Return the ssch CC code. -Q5. Return the CC code to the guest. - -... ... - - K5. Interrupt handler gets the I/O result and write the result to - the I/O region. - K6. Signal QEMU to retrieve the result. -Q6. Get the signal and event handler reads out the result from the I/O - region. -Q7. Update the irb for the guest. - -Limitations ------------ - -The current vfio-ccw implementation focuses on supporting basic commands -needed to implement block device functionality (read/write) of DASD/ECKD -device only. Some commands may need special handling in the future, for -example, anything related to path grouping. - -DASD is a kind of storage device. While ECKD is a data recording format. -More information for DASD and ECKD could be found here: -https://en.wikipedia.org/wiki/Direct-access_storage_device -https://en.wikipedia.org/wiki/Count_key_data - -Together with the corresponding work in QEMU, we can bring the passed -through DASD/ECKD device online in a guest now and use it as a block -device. - -While the current code allows the guest to start channel programs via -START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is -not yet implemented. - -vfio-ccw supports classic (command mode) channel I/O only. Transport -mode (HPF) is not supported. - -QDIO subchannels are currently not supported. Classic devices other than -DASD/ECKD might work, but have not been tested. - -Reference ---------- -1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) -2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) -3. https://en.wikipedia.org/wiki/Channel_I/O -4. Documentation/s390/cds.txt -5. Documentation/vfio.txt -6. Documentation/vfio-mediated-device.txt diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst new file mode 100644 index 000000000000..54e8e7caf7e7 --- /dev/null +++ b/Documentation/s390/zfcpdump.rst @@ -0,0 +1,50 @@ +================================== +The s390 SCSI dump tool (zfcpdump) +================================== + +System z machines (z900 or higher) provide hardware support for creating system +dumps on SCSI disks. The dump process is initiated by booting a dump tool, which +has to create a dump of the current (probably crashed) Linux image. In order to +not overwrite memory of the crashed Linux with data of the dump tool, the +hardware saves some memory plus the register sets of the boot CPU before the +dump tool is loaded. There exists an SCLP hardware interface to obtain the saved +memory afterwards. Currently 32 MB are saved. + +This zfcpdump implementation consists of a Linux dump kernel together with +a user space dump tool, which are loaded together into the saved memory region +below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in +the s390-tools package) to make the device bootable. The operator of a Linux +system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump +resides on. + +The user space dump tool accesses the memory of the crashed system by means +of the /proc/vmcore interface. This interface exports the crashed system's +memory and registers in ELF core dump format. To access the memory which has +been saved by the hardware SCLP requests will be created at the time the data +is needed by /proc/vmcore. The tail part of the crashed systems memory which +has not been stashed by hardware can just be copied from real memory. + +To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP +has to be set. + +To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig". + +The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs +under the following locations: + +* kernel: /zfcpdump.image +* ramdisk: /zfcpdump.rd + +The zfcpdump directory is defined in the s390-tools package. + +The user space application of zfcpdump can reside in an intitramfs or an +initrd. It can also be included in a built-in kernel initramfs. The application +reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk. + +The s390-tools package version 1.24.0 and above builds an external zfcpdump +initramfs with a user space application that writes the dump to a SCSI +partition. + +For more information on how to use zfcpdump refer to the s390 'Using the Dump +Tools book', which is available from +http://www.ibm.com/developerworks/linux/linux390. diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.txt deleted file mode 100644 index b064aa59714d..000000000000 --- a/Documentation/s390/zfcpdump.txt +++ /dev/null @@ -1,48 +0,0 @@ -The s390 SCSI dump tool (zfcpdump) - -System z machines (z900 or higher) provide hardware support for creating system -dumps on SCSI disks. The dump process is initiated by booting a dump tool, which -has to create a dump of the current (probably crashed) Linux image. In order to -not overwrite memory of the crashed Linux with data of the dump tool, the -hardware saves some memory plus the register sets of the boot CPU before the -dump tool is loaded. There exists an SCLP hardware interface to obtain the saved -memory afterwards. Currently 32 MB are saved. - -This zfcpdump implementation consists of a Linux dump kernel together with -a user space dump tool, which are loaded together into the saved memory region -below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in -the s390-tools package) to make the device bootable. The operator of a Linux -system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump -resides on. - -The user space dump tool accesses the memory of the crashed system by means -of the /proc/vmcore interface. This interface exports the crashed system's -memory and registers in ELF core dump format. To access the memory which has -been saved by the hardware SCLP requests will be created at the time the data -is needed by /proc/vmcore. The tail part of the crashed systems memory which -has not been stashed by hardware can just be copied from real memory. - -To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP -has to be set. - -To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig". - -The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs -under the following locations: - -* kernel: /zfcpdump.image -* ramdisk: /zfcpdump.rd - -The zfcpdump directory is defined in the s390-tools package. - -The user space application of zfcpdump can reside in an intitramfs or an -initrd. It can also be included in a built-in kernel initramfs. The application -reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk. - -The s390-tools package version 1.24.0 and above builds an external zfcpdump -initramfs with a user space application that writes the dump to a SCSI -partition. - -For more information on how to use zfcpdump refer to the s390 'Using the Dump -Tools book', which is available from -http://www.ibm.com/developerworks/linux/linux390. -- cgit From a20aa857e0c207c27d4b2c98af7d97539faf2cc5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:17 -0300 Subject: s390: include/asm/debug.h add kerneldoc markups Instead of keeping the documentation inside s390dbf.rst, move them to arch/s390/include/asm/debug.h, using standard kernel-doc markups. Keeping the documentation close to the code helps to keep it updated. It also makes easier to document other stuff inside debug.h, as all it needs is to add kernel-doc markups inside it, as the file will be already be included at the produced documentation. - Those were converted to kerneldoc using this script specially designed to parse ths file, and manually editted: Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Heiko Carstens --- Documentation/s390/s390dbf.rst | 672 +---------------------------------------- 1 file changed, 1 insertion(+), 671 deletions(-) (limited to 'Documentation') diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst index ec2a1faa414b..d2595b548879 100644 --- a/Documentation/s390/s390dbf.rst +++ b/Documentation/s390/s390dbf.rst @@ -104,684 +104,14 @@ the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug feature cannot be stopped. If the debug feature is already stopped, it will stay deactivated. ----------------------------------------------------------------------------- - Kernel Interfaces: ------------------ -:: - - debug_info_t *debug_register(char *name, int pages, int nr_areas, - int buf_size); - -Parameter: - name: - Name of debug log (e.g. used for debugfs entry) - pages: - Number of pages, which will be allocated per area - nr_areas: - Number of debug areas - buf_size: - Size of data area in each debug entry - -Return Value: - Handle for generated debug area - - NULL if register failed - -Description: Allocates memory for a debug log - Must not be called within an interrupt handler - ----------------------------------------------------------------------------- - -:: - - debug_info_t *debug_register_mode(char *name, int pages, int nr_areas, - int buf_size, mode_t mode, uid_t uid, - gid_t gid); - -Parameter: - name: - Name of debug log (e.g. used for debugfs entry) - pages: - Number of pages, which will be allocated per area - nr_areas: - Number of debug areas - buf_size: - Size of data area in each debug entry - mode: - File mode for debugfs files. E.g. S_IRWXUGO - uid: - User ID for debugfs files. Currently only 0 is - supported. - gid: - Group ID for debugfs files. Currently only 0 is - supported. - -Return Value: - Handle for generated debug area - - NULL if register failed - -Description: - Allocates memory for a debug log - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- - -:: - - void debug_unregister (debug_info_t * id); - -Parameter: - id: - handle for debug log - -Return Value: - none - -Description: - frees memory for a debug log and removes all registered debug - views. - - Must not be called within an interrupt handler - ---------------------------------------------------------------------------- - -:: - - void debug_set_level (debug_info_t * id, int new_level); - -Parameter: id: handle for debug log - new_level: new debug level - -Return Value: - none - -Description: - Sets new actual debug level if new_level is valid. - ---------------------------------------------------------------------------- - -:: - - bool debug_level_enabled (debug_info_t * id, int level); - -Parameter: - id: - handle for debug log - level: - debug level - -Return Value: - True if level is less or equal to the current debug level. - -Description: - Returns true if debug events for the specified level would be - logged. Otherwise returns false. - ---------------------------------------------------------------------------- - -:: - - void debug_stop_all(void); - -Parameter: - none - -Return Value: - none - -Description: - stops the debug feature if stopping is allowed. Currently - used in case of a kernel oops. - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_event (debug_info_t* id, int level, void* data, - int length); - -Parameter: - id: - handle for debug log - level: - debug level - data: - pointer to data for debug entry - length: - length of data in bytes - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_int_event (debug_info_t * id, int level, - unsigned int data); - debug_entry_t* debug_long_event(debug_info_t * id, int level, - unsigned long data); - -Parameter: - id: - handle for debug log - level: - debug level - data: - integer value for debug entry - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_text_event (debug_info_t * id, int level, - const char* data); - -Parameter: - id: - handle for debug log - level: - debug level - data: - string for debug entry - -Return Value: - Address of written debug entry - -Description: - writes debug entry in ascii format to active debug area - (if level <= actual debug level) - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_sprintf_event (debug_info_t * id, int level, - char* string,...); - -Parameter: - id: - handle for debug log - level: - debug level - string: - format string for debug entry - ...: - varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: - writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level). - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_exception (debug_info_t* id, int level, void* data, - int length); - -Parameter: - id: - handle for debug log - level: - debug level - data: - pointer to data for debug entry - length: - length of data in bytes - -Return Value: - Address of written debug entry - -Description: - writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_int_exception (debug_info_t * id, int level, - unsigned int data); - debug_entry_t* debug_long_exception(debug_info_t * id, int level, - unsigned long data); - -Parameter: id: handle for debug log - level: debug level - data: integer value for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry to active debug area (if level <= actual - debug level) and switches to next debug area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_text_exception (debug_info_t * id, int level, - const char* data); - -Parameter: id: handle for debug log - level: debug level - data: string for debug entry - -Return Value: Address of written debug entry - -Description: writes debug entry in ascii format to active debug area - (if level <= actual debug level) and switches to next debug - area - ---------------------------------------------------------------------------- - -:: - - debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level, - char* string,...); - -Parameter: id: handle for debug log - level: debug level - string: format string for debug entry - ...: varargs used as in sprintf() - -Return Value: Address of written debug entry - -Description: writes debug entry with format string and varargs (longs) to - active debug area (if level $<=$ actual debug level) and - switches to next debug area. - floats and long long datatypes cannot be used as varargs. - ---------------------------------------------------------------------------- - -:: - - int debug_register_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: registers new debug view and creates debugfs dir entry - ---------------------------------------------------------------------------- - -:: - - int debug_unregister_view (debug_info_t * id, struct debug_view *view); - -Parameter: id: handle for debug log - view: pointer to debug view struct - -Return Value: 0 : ok - < 0: Error - -Description: unregisters debug view and removes debugfs dir entry - - +.. kernel-doc:: arch/s390/include/asm/debug.h Predefined views: ----------------- -extern struct debug_view debug_hex_ascii_view; - -extern struct debug_view debug_raw_view; - -extern struct debug_view debug_sprintf_view; - -Examples --------- - -:: - - /* - * hex_ascii- + raw-view Example - */ - - #include - #include - - static debug_info_t* debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and 4 byte data field */ - - debug_info = debug_register ("test", 1, 4, 4 ); - debug_register_view(debug_info,&debug_hex_ascii_view); - debug_register_view(debug_info,&debug_raw_view); - - debug_text_event(debug_info, 4 , "one "); - debug_int_exception(debug_info, 4, 4711); - debug_event(debug_info, 3, &debug_info, 4); - - return 0; - } - - static void cleanup(void) - { - debug_unregister (debug_info); - } - - module_init(init); - module_exit(cleanup); - ---------------------------------------------------------------------------- - -:: - - /* - * sprintf-view Example - */ - - #include - #include - - static debug_info_t* debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and data field for */ - /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ - - debug_info = debug_register ("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info,&debug_sprintf_view); - - debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); - debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); - - return 0; - } - - static void cleanup(void) - { - debug_unregister (debug_info); - } - - module_init(init); - module_exit(cleanup); - -Debugfs Interface ------------------ -Views to the debug logs can be investigated through reading the corresponding -debugfs-files: - -Example:: - - > ls /sys/kernel/debug/s390dbf/dasd - flush hex_ascii level pages raw - > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s - 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... - 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE - 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... - 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP - 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD - 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... - 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... - 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... - 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE - 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... - -See section about predefined views for explanation of the above output! - -Changing the debug level ------------------------- - -Example:: - - - > cat /sys/kernel/debug/s390dbf/dasd/level - 3 - > echo "5" > /sys/kernel/debug/s390dbf/dasd/level - > cat /sys/kernel/debug/s390dbf/dasd/level - 5 - -Flushing debug areas --------------------- -Debug areas can be flushed with piping the number of the desired -area (0...n) to the debugfs file "flush". When using "-" all debug areas -are flushed. - -Examples: - -1. Flush debug area 0:: - - > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush - -2. Flush all debug areas:: - - > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush - -Changing the size of debug areas ------------------------------------- -It is possible the change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. - -Example: - -Define 4 pages for the debug areas of debug feature "dasd":: - - > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages - -Stooping the debug feature --------------------------- -Example: - -1. Check if stopping is allowed:: - - > cat /proc/sys/s390dbf/debug_stoppable - -2. Stop debug feature:: - - > echo 0 > /proc/sys/s390dbf/debug_active - -lcrash Interface ----------------- -It is planned that the dump analysis tool lcrash gets an additional command -'s390dbf' to display all the debug logs. With this tool it will be possible -to investigate the debug logs on a live system and with a memory dump after -a system crash. - -Investigating raw memory ------------------------- -One last possibility to investigate the debug logs at a live -system and after a system crash is to look at the raw memory -under VM or at the Service Element. -It is possible to find the anker of the debug-logs through -the 'debug_area_first' symbol in the System map. Then one has -to follow the correct pointers of the data-structures defined -in debug.h and find the debug-areas in memory. -Normally modules which use the debug feature will also have -a global variable with the pointer to the debug-logs. Following -this pointer it will also be possible to find the debug logs in -memory. - -For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in debug_register() in -order to see the debug entries well formatted. - - -Predefined Views ----------------- - -There are three predefined views: hex_ascii, raw and sprintf. -The hex_ascii view shows the data field in hex and ascii representation -(e.g. '45 43 4b 44 | ECKD'). -The raw view returns a bytestream as the debug areas are stored in memory. - -The sprintf view formats the debug entries in the same way as the sprintf -function would do. The sprintf event/exception functions write to the -debug entry a pointer to the format string (size = sizeof(long)) -and for each vararg a long value. So e.g. for a debug entry with a format -string plus two varargs one would need to allocate a (3 * sizeof(long)) -byte data area in the debug_register() function. - -IMPORTANT: - Using "%s" in sprintf event functions is dangerous. You can only - use "%s" in the sprintf event functions, if the memory for the passed string - is available as long as the debug feature exists. The reason behind this is - that due to performance considerations only a pointer to the string is stored - in the debug feature. If you log a string that is freed afterwards, you will - get an OOPS when inspecting the debug feature, because then the debug feature - will access the already freed memory. - -NOTE: - If using the sprintf view do NOT use other event/exception functions - than the sprintf-event and -exception functions. - -The format of the hex_ascii and sprintf view is as follows: - -- Number of area -- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated - Universal Time (UTC), January 1, 1970) -- level of debug entry -- Exception flag (* = Exception) -- Cpu-Number of calling task -- Return Address to caller -- data field - -The format of the raw view is: - -- Header as described in debug.h -- datafield - -A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view): - -area time level exception cpu caller data (hex + ascii) --------------------------------------------------------------------------- -00 00964419409:440690 1 - 00 88023fe - - -Defining views --------------- - -Views are specified with the 'debug_view' structure. There are defined -callback functions which are used for reading and writing the debugfs files:: - - struct debug_view { - char name[DEBUG_MAX_PROCF_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; - }; - -where:: - - typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, - int area, - debug_entry_t* entry, - char* out_buf); - - typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); - typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); - typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, const char* user_buf, - size_t in_buf_size, loff_t* offset); - - -The "private_data" member can be used as pointer to view specific data. -It is not used by the debug feature itself. - -The output when reading a debugfs file is structured like this:: - - "prolog_proc output" - - "header_proc output 1" "format_proc output 1" - "header_proc output 2" "format_proc output 2" - "header_proc output 3" "format_proc output 3" - ... - -When a view is read from the debugfs, the Debug Feature calls the -'prolog_proc' once for writing the prolog. -Then 'header_proc' and 'format_proc' are called for each -existing debug entry. - -The input_proc can be used to implement functionality when it is written to -the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level). - -For header_proc there can be used the default function -debug_dflt_header_fn() which is defined in debug.h. -and which produces the same header output as the predefined views. -E.g:: - - 00 00964419409:440761 2 - 00 88023ec - -In order to see how to use the callback functions check the implementation -of the default views! - -Example:: - - #include - - #define UNKNOWNSTR "data: %08x" - - const char* messages[] = - {"This error...........\n", - "That error...........\n", - "Problem..............\n", - "Something went wrong.\n", - "Everything ok........\n", - NULL - }; - - static int debug_test_format_fn( - debug_info_t * id, struct debug_view *view, - char *out_buf, const char *in_buf - ) - { - int i, rc = 0; - - if(id->buf_size >= 4) { - int msg_nr = *((int*)in_buf); - if(msg_nr < sizeof(messages)/sizeof(char*) - 1) - rc += sprintf(out_buf, "%s", messages[msg_nr]); - else - rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); - } - out: - return rc; - } - - struct debug_view debug_test_view = { - "myview", /* name of view */ - NULL, /* no prolog */ - &debug_dflt_header_fn, /* default header for each entry */ - &debug_test_format_fn, /* our own format function */ - NULL, /* no input function */ - NULL /* no private data */ - }; - -test: -===== - :: debug_info_t *debug_info; -- cgit From 7391d7f4b0690a98c5bf6a113c33400e2d29d9f8 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 28 May 2019 10:07:56 +0200 Subject: dt-bindings: clk: meson: add g12b periph clock controller bindings Update the documentation to support clock driver for the Amlogic G12B SoC. G12B clock driver is very close, the main differences are : - the clock tree is duplicated for the both clusters, and the SYS_PLL are swapped between the clusters - G12B has additional clocks like for CSI an other components Reviewed-by: Rob Herring Reviewed-by: Martin Blumenstingl Signed-off-by: Neil Armstrong Signed-off-by: Jerome Brunet --- Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt index 5c8b105be4d6..6eaa52092313 100644 --- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt +++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt @@ -10,6 +10,7 @@ Required Properties: "amlogic,gxl-clkc" for GXL and GXM SoC, "amlogic,axg-clkc" for AXG SoC. "amlogic,g12a-clkc" for G12A SoC. + "amlogic,g12b-clkc" for G12B SoC. - clocks : list of clock phandle, one for each entry clock-names. - clock-names : should contain the following: * "xtal": the platform xtal -- cgit From 5028bd68155985d3d9bc1af570349d7c56608897 Mon Sep 17 00:00:00 2001 From: Xingyu Chen Date: Sat, 8 Jun 2019 21:04:09 +0200 Subject: dt-bindings: interrupt-controller: New binding for Meson-G12A SoC Update the dt-binding document to support new compatible string for the GPIO interrupt controller which found in Amlogic's Meson-G12A SoC. Signed-off-by: Xingyu Chen Signed-off-by: Jianxin Pan Signed-off-by: Martin Blumenstingl Signed-off-by: Marc Zyngier --- .../devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt index 1502a51548bb..7d531d5fff29 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt @@ -15,6 +15,7 @@ Required properties: "amlogic,meson-gxbb-gpio-intc" for GXBB SoCs (S905) or "amlogic,meson-gxl-gpio-intc" for GXL SoCs (S905X, S912) "amlogic,meson-axg-gpio-intc" for AXG SoCs (A113D, A113X) + "amlogic,meson-g12a-gpio-intc" for G12A SoCs (S905D2, S905X2, S905Y2) - reg : Specifies base physical address and size of the registers. - interrupt-controller : Identifies the node as an interrupt controller. - #interrupt-cells : Specifies the number of cells needed to encode an -- cgit From dc96f45074a55f307d9618ebd444991fc643836c Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 6 Jun 2019 15:37:32 +0800 Subject: dt-bindings: interrupt-controller: Update csky mpintc Add trigger type setting for csky,mpintc. The driver also could support #interrupt-cells <1> and it wouldn't invalidate existing DTs. Here we only show the complete format. Signed-off-by: Guo Ren Reviewed-by: Rob Herring Cc: Marc Zyngier Signed-off-by: Marc Zyngier --- .../bindings/interrupt-controller/csky,mpintc.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt index ab921f1698fb..e13405355166 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt @@ -6,11 +6,16 @@ C-SKY Multi-processors Interrupt Controller is designed for ck807/ck810/ck860 SMP soc, and it also could be used in non-SMP system. Interrupt number definition: - 0-15 : software irq, and we use 15 as our IPI_IRQ. 16-31 : private irq, and we use 16 as the co-processor timer. 31-1024: common irq for soc ip. +Interrupt triger mode: (Defined in dt-bindings/interrupt-controller/irq.h) + IRQ_TYPE_LEVEL_HIGH (default) + IRQ_TYPE_LEVEL_LOW + IRQ_TYPE_EDGE_RISING + IRQ_TYPE_EDGE_FALLING + ============================= intc node bindings definition ============================= @@ -26,15 +31,22 @@ intc node bindings definition - #interrupt-cells Usage: required Value type: - Definition: must be <1> + Definition: <2> - interrupt-controller: Usage: required -Examples: +Examples: ("interrupts = ") --------- +#include intc: interrupt-controller { compatible = "csky,mpintc"; - #interrupt-cells = <1>; + #interrupt-cells = <2>; interrupt-controller; }; + + device: device-example { + ... + interrupts = <34 IRQ_TYPE_EDGE_RISING>; + interrupt-parent = <&intc>; + }; -- cgit From 73c699ffe53868d9fc1bd50445ad2c5041f379e3 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 25 May 2019 15:41:38 +0200 Subject: dt-bindings: vendor: Add a bunch of vendors Add all the missing vendors used in Allwinner DTS. Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- .../devicetree/bindings/vendor-prefixes.yaml | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 653dee9d7dd0..e68c839e9f51 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -175,6 +175,8 @@ patternProperties: description: Common Hardware Reference Platform "^chunghwa,.*": description: Chunghwa Picture Tubes Ltd. + "^chuwi,.*": + description: Chuwi Innovation Ltd. "^ciaa,.*": description: Computadora Industrial Abierta Argentina "^cirrus,.*": @@ -185,8 +187,12 @@ patternProperties: description: Chips&Media, Inc. "^cnxt,.*": description: Conexant Systems, Inc. + "^colorfly,.*": + description: Colorful GRP, Shenzhen Xueyushi Technology Ltd. "^compulab,.*": description: CompuLab Ltd. + "^corpro,.*": + description: Chengdu Corpro Technology Co., Ltd. "^cortina,.*": description: Cortina Systems, Inc. "^cosmic,.*": @@ -199,6 +205,8 @@ patternProperties: description: Crystalfontz America, Inc. "^csky,.*": description: Hangzhou C-SKY Microsystems Co., Ltd + "^csq,.*": + description: Shenzen Chuangsiqi Technology Co.,Ltd. "^cubietech,.*": description: Cubietech, Ltd. "^cypress,.*": @@ -219,6 +227,8 @@ patternProperties: description: Devantech, Ltd. "^dh,.*": description: DH electronics GmbH + "^difrnce,.*": + description: Shenzhen Yagu Electronic Technology Co., Ltd. "^digi,.*": description: Digi International Inc. "^digilent,.*": @@ -241,6 +251,8 @@ patternProperties: description: DPTechnics "^dragino,.*": description: Dragino Technology Co., Limited + "^dserve,.*": + description: dServe Technology B.V. "^ea,.*": description: Embedded Artists AB "^ebs-systart,.*": @@ -263,6 +275,8 @@ patternProperties: description: Emlid, Ltd. "^emmicro,.*": description: EM Microelectronic + "^empire-electronix,.*": + description: Empire Electronix "^emtrion,.*": description: emtrion GmbH "^endless,.*": @@ -329,6 +343,8 @@ patternProperties: description: GE Fanuc Intelligent Platforms Embedded Systems, Inc. "^GEFanuc,.*": description: GE Fanuc Intelligent Platforms Embedded Systems, Inc. + "^gemei,.*": + description: Gemei Digital Technology Co., Ltd. "^geniatech,.*": description: Geniatech, Inc. "^giantec,.*": @@ -375,10 +391,14 @@ patternProperties: description: Honeywell "^hp,.*": description: Hewlett Packard + "^hsg,.*": + description: HannStar Display Co. "^holtek,.*": description: Holtek Semiconductor, Inc. "^hwacom,.*": description: HwaCom Systems Inc. + "^hyundai,.*": + description: Hyundai Technology "^i2se,.*": description: I2SE GmbH "^ibm,.*": @@ -393,6 +413,10 @@ patternProperties: description: ILI Technology Corporation (ILITEK) "^img,.*": description: Imagination Technologies Ltd. + "^incircuit,.*": + description: In-Circuit GmbH + "^inet-tek,.*": + description: Shenzhen iNet Mobile Internet Technology Co., Ltd "^infineon,.*": description: Infineon Technologies "^inforce,.*": @@ -427,6 +451,8 @@ patternProperties: description: Japan Display Inc. "^jedec,.*": description: JEDEC Solid State Technology Association + "^jesurun,.*": + description: Shenzhen Jesurun Electronics Business Dept. "^jianda,.*": description: Jiandangjing Technology Co., Ltd. "^karo,.*": @@ -459,6 +485,8 @@ patternProperties: description: LaCie "^laird,.*": description: Laird PLC + "^lamobo,.*": + description: Ketai Huajie Technology Co., Ltd. "^lantiq,.*": description: Lantiq Semiconductor "^lattice,.*": @@ -477,6 +505,8 @@ patternProperties: description: Lichee Pi "^linaro,.*": description: Linaro Limited + "^linksprite,.*": + description: LinkSprite Technologies, Inc. "^linksys,.*": description: Belkin International, Inc. (Linksys) "^linux,.*": @@ -493,6 +523,8 @@ patternProperties: description: Liebherr-Werk Nenzing GmbH "^macnica,.*": description: Macnica Americas + "^mapleboard,.*": + description: Mapleboard.org "^marvell,.*": description: Marvell Technology Group Ltd. "^maxbotix,.*": @@ -533,6 +565,8 @@ patternProperties: description: Micron Technology Inc. "^mikroe,.*": description: MikroElektronika d.o.o. + "^miniand,.*": + description: Miniand Tech "^minix,.*": description: MINIX Technology Ltd. "^miramems,.*": @@ -663,6 +697,8 @@ patternProperties: description: Picochip Ltd "^pine64,.*": description: Pine64 + "^pineriver,.*": + description: Shenzhen PineRiver Designs Co., Ltd. "^pixcir,.*": description: PIXCIR MICROELECTRONICS Co., Ltd "^plantower,.*": @@ -675,12 +711,18 @@ patternProperties: description: Broadcom Corporation (formerly PLX Technology) "^pni,.*": description: PNI Sensor Corporation + "^polaroid,.*": + description: Polaroid Corporation "^portwell,.*": description: Portwell Inc. "^poslab,.*": description: Poslab Technology Co., Ltd. + "^pov,.*": + description: Point of View International B.V. "^powervr,.*": description: PowerVR (deprecated, use img) + "^primux,.*": + description: Primux Trading, S.L. "^probox2,.*": description: PROBOX2 (by W2COMP Co., Ltd.) "^pulsedlight,.*": @@ -693,6 +735,8 @@ patternProperties: description: QEMU, a generic and open source machine emulator and virtualizer "^qi,.*": description: Qi Hardware + "^qihua,.*": + description: Chengdu Kaixuan Information Technology Co., Ltd. "^qiaodian,.*": description: QiaoDian XianShi Corporation "^qnap,.*": @@ -715,6 +759,8 @@ patternProperties: description: Realtek Semiconductor Corp. "^renesas,.*": description: Renesas Electronics Corporation + "^rervision,.*": + description: Shenzhen Rervision Technology Co., Ltd. "^richtek,.*": description: Richtek Technology Corporation "^ricoh,.*": @@ -785,6 +831,10 @@ patternProperties: description: Silicon Mitus, Inc. "^simtek,.*": description: Cypress Semiconductor Corporation (Simtek Corporation) + "^sinlinx,.*": + description: Sinlinx Electronics Technology Co., LTD + "^sinovoip,.*": + description: SinoVoip Co., Ltd "^sirf,.*": description: SiRF Technology, Inc. "^sis,.*": @@ -903,6 +953,8 @@ patternProperties: description: United Radiant Technology Corporation "^usi,.*": description: Universal Scientific Industrial Co., Ltd. + "^utoo,.*": + description: Aigo Digital Technology Co., Ltd. "^v3,.*": description: V3 Semiconductor "^vamrs,.*": @@ -939,10 +991,14 @@ patternProperties: description: Winbond Electronics corp. "^winstar,.*": description: Winstar Display Corp. + "^wits,.*": + description: Shenzhen Merrii Technology Co., Ltd. (WITS) "^wlf,.*": description: Wolfson Microelectronics "^wm,.*": description: Wondermedia Technologies, Inc. + "^wobo,.*": + description: Wobo "^x-powers,.*": description: X-Powers "^xes,.*": @@ -953,6 +1009,8 @@ patternProperties: description: Xilinx "^xunlong,.*": description: Shenzhen Xunlong Software CO.,Limited + "^yones-toptech,.*": + description: Yones Toptech Co., Ltd. "^ysoft,.*": description: Y Soft Corporation a.s. "^zarlink,.*": -- cgit From a1ccca0e84243c8aa39f4700cecc200b36c6b50f Mon Sep 17 00:00:00 2001 From: Maxime Jourdan Date: Thu, 6 Jun 2019 12:05:10 -0400 Subject: media: dt-bindings: media: add Amlogic Video Decoder Bindings Add documentation for the meson vdec dts node. Signed-off-by: Maxime Jourdan Reviewed-by: Rob Herring Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../devicetree/bindings/media/amlogic,vdec.txt | 71 ++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 Documentation/devicetree/bindings/media/amlogic,vdec.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/media/amlogic,vdec.txt b/Documentation/devicetree/bindings/media/amlogic,vdec.txt new file mode 100644 index 000000000000..aabdd01bcf32 --- /dev/null +++ b/Documentation/devicetree/bindings/media/amlogic,vdec.txt @@ -0,0 +1,71 @@ +Amlogic Video Decoder +================================ + +The video decoding IP lies within the DOS memory region, +except for the hardware bitstream parser that makes use of an undocumented +region. + +It makes use of the following blocks: + +- ESPARSER is a bitstream parser that outputs to a VIFIFO. Further VDEC blocks +then feed from this VIFIFO. +- VDEC_1 can decode MPEG-1, MPEG-2, MPEG-4 part 2, MJPEG, H.263, H.264, VC-1. +- VDEC_HEVC can decode HEVC and VP9. + +Both VDEC_1 and VDEC_HEVC share the "vdec" IRQ and as such cannot run +concurrently. + +Device Tree Bindings: +--------------------- + +VDEC: Video Decoder +-------------------------- + +Required properties: +- compatible: value should be different for each SoC family as : + - GXBB (S905) : "amlogic,gxbb-vdec" + - GXL (S905X, S905D) : "amlogic,gxl-vdec" + - GXM (S912) : "amlogic,gxm-vdec" +- reg: base address and size of he following memory-mapped regions : + - dos + - esparser +- reg-names: should contain the names of the previous memory regions +- interrupts: should contain the following IRQs: + - vdec + - esparser +- interrupt-names: should contain the names of the previous interrupts +- amlogic,ao-sysctrl: should point to the AOBUS sysctrl node +- amlogic,canvas: should point to a canvas provider node +- clocks: should contain the following clocks : + - dos_parser + - dos + - vdec_1 + - vdec_hevc +- clock-names: should contain the names of the previous clocks +- resets: should contain the parser reset +- reset-names: should be "esparser" + +Example: + +vdec: video-decoder@c8820000 { + compatible = "amlogic,gxbb-vdec"; + reg = <0x0 0xc8820000 0x0 0x10000>, + <0x0 0xc110a580 0x0 0xe4>; + reg-names = "dos", "esparser"; + + interrupts = , + ; + interrupt-names = "vdec", "esparser"; + + amlogic,ao-sysctrl = <&sysctrl_AO>; + amlogic,canvas = <&canvas>; + + clocks = <&clkc CLKID_DOS_PARSER>, + <&clkc CLKID_DOS>, + <&clkc CLKID_VDEC_1>, + <&clkc CLKID_VDEC_HEVC>; + clock-names = "dos_parser", "dos", "vdec_1", "vdec_hevc"; + + resets = <&reset RESET_PARSER>; + reset-names = "esparser"; +}; -- cgit From f953d33ba1225d68cf8790b4706d8c4410b15926 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 10 Jun 2019 21:40:02 -0700 Subject: net/tls: add kernel-driven TLS RX resync TLS offload device may lose sync with the TCP stream if packets arrive out of order. Drivers can currently request a resync at a specific TCP sequence number. When a record is found starting at that sequence number kernel will inform the device of the corresponding record number. This requires the device to constantly scan the stream for a known pattern (constant bytes of the header) after sync is lost. This patch adds an alternative approach which is entirely under the control of the kernel. Kernel tracks records it had to fully decrypt, even though TLS socket is in TLS_HW mode. If multiple records did not have any decrypted parts - it's a pretty strong indication that the device is out of sync. We choose the min number of fully encrypted records to be 2, which should hopefully be more than will get retransmitted at a time. After kernel decides the device is out of sync it schedules a resync request. If the TCP socket is empty the resync gets performed immediately. If socket is not empty we leave the record parser to resync when next record comes. Before resync in message parser we peek at the TCP socket and don't attempt the sync if the socket already has some of the next record queued. On resync failure (encrypted data continues to flow in) we retry with exponential backoff, up to once every 128 records (with a 16k record thats at most once every 2M of data). Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index eb7c9b81ccf5..d134d63307e7 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -268,6 +268,9 @@ Device can only detect that segment 4 also contains a TLS header if it knows the length of the previous record from segment 2. In this case the device will lose synchronization with the stream. +Stream scan resynchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + When the device gets out of sync and the stream reaches TCP sequence numbers more than a max size record past the expected TCP sequence number, the device starts scanning for a known header pattern. For example @@ -298,6 +301,22 @@ Special care has to be taken if the confirmation request is passed asynchronously to the packet stream and record may get processed by the kernel before the confirmation request. +Stack-driven resynchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The driver may also request the stack to perform resynchronization +whenever it sees the records are no longer getting decrypted. +If the connection is configured in this mode the stack automatically +schedules resynchronization after it has received two completely encrypted +records. + +The stack waits for the socket to drain and informs the device about +the next expected record number and its TCP sequence number. If the +records continue to be received fully encrypted stack retries the +synchronization with an exponential back off (first after 2 encrypted +records, then after 4 records, after 8, after 16... up until every +128 records). + Error handling ============== -- cgit From 50180074099fcda752d9d56282d23242b126ebc9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 10 Jun 2019 21:40:09 -0700 Subject: net/tls: add kernel-driven resync mechanism for TX TLS offload drivers keep track of TCP seq numbers to make sure the packets are fed into the HW in order. When packets get dropped on the way through the stack, the driver will get out of sync and have to use fallback encryption, but unless TCP seq number is resynced it will never match the packets correctly (or even worse - use incorrect record sequence number after TCP seq wraps). Existing drivers (mlx5) feed the entire record on every out-of-order event, allowing FW/HW to always be in sync. This patch adds an alternative, more akin to the RX resync. When driver sees a frame which is past its expected sequence number the stream must have gotten out of order (if the sequence number is smaller than expected its likely a retransmission which doesn't require resync). Driver will ask the stack to perform TX sync before it submits the next full record, and fall back to software crypto until stack has performed the sync. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index d134d63307e7..048e5ca44824 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -206,7 +206,11 @@ TX Segments transmitted from an offloaded socket can get out of sync in similar ways to the receive side-retransmissions - local drops -are possible, though network reorders are not. +are possible, though network reorders are not. There are currently +two mechanisms for dealing with out of order segments. + +Crypto state rebuilding +~~~~~~~~~~~~~~~~~~~~~~~ Whenever an out of order segment is transmitted the driver provides the device with enough information to perform cryptographic operations. @@ -225,6 +229,35 @@ was just a retransmission. The former is simpler, and does not require retransmission detection therefore it is the recommended method until such time it is proven inefficient. +Next record sync +~~~~~~~~~~~~~~~~ + +Whenever an out of order segment is detected the driver requests +that the ``ktls`` software fallback code encrypt it. If the segment's +sequence number is lower than expected the driver assumes retransmission +and doesn't change device state. If the segment is in the future, it +may imply a local drop, the driver asks the stack to sync the device +to the next record state and falls back to software. + +Resync request is indicated with: + +.. code-block:: c + + void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) + +Until resync is complete driver should not access its expected TCP +sequence number (as it will be updated from a different context). +Following helper should be used to test if resync is complete: + +.. code-block:: c + + bool tls_offload_tx_resync_pending(struct sock *sk) + +Next time ``ktls`` pushes a record it will first send its TCP sequence number +and TLS record number to the driver. Stack will also make sure that +the new record will start on a segment boundary (like it does when +the connection is initially added). + RX -- -- cgit From b365c124f1ef921125e6c355b5d16a7d165138c4 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 3 Jun 2019 11:10:06 +0200 Subject: dt-bindings: arm: amlogic: add G12B bindings Add compatible for the Amlogic G12B SoC, sharing most of the features and architecture with the G12A SoC. Signed-off-by: Neil Armstrong Reviewed-by: Rob Herring Reviewed-by: Martin Blumenstingl Signed-off-by: Kevin Hilman --- Documentation/devicetree/bindings/arm/amlogic.yaml | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml index a0dd12be4de4..8434be88d599 100644 --- a/Documentation/devicetree/bindings/arm/amlogic.yaml +++ b/Documentation/devicetree/bindings/arm/amlogic.yaml @@ -135,4 +135,8 @@ properties: - amlogic,u200 - const: amlogic,g12a + - description: Boards with the Amlogic Meson G12B S922X SoC + items: + - const: amlogic,g12b + ... -- cgit From 3113784003dd7ca287fe082ca87969d4de721907 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 3 Jun 2019 11:10:07 +0200 Subject: dt-bindings: arm: amlogic: add Odroid-N2 binding Add compatible for the Amlogic G12B (S922X) SoC based Odroid-N2 SBC from HardKernel. Signed-off-by: Neil Armstrong Reviewed-by: Rob Herring Reviewed-by: Martin Blumenstingl Signed-off-by: Kevin Hilman --- Documentation/devicetree/bindings/arm/amlogic.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml index 8434be88d599..325c6fd3566d 100644 --- a/Documentation/devicetree/bindings/arm/amlogic.yaml +++ b/Documentation/devicetree/bindings/arm/amlogic.yaml @@ -137,6 +137,8 @@ properties: - description: Boards with the Amlogic Meson G12B S922X SoC items: + - enum: + - hardkernel,odroid-n2 - const: amlogic,g12b ... -- cgit From 8ad2b4b371bcfe7069ee28ad18d722e1240a0a97 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 11 Jun 2019 21:45:33 -0700 Subject: dt-bindings: soc: qcom: Add AOSS QMP binding Add binding for the QMP based side-channel communication mechanism to the AOSS, which is used to control resources not exposed through the RPMh interface. Reviewed-by: Rob Herring Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt | 81 ++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt new file mode 100644 index 000000000000..954ffee0a9c4 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt @@ -0,0 +1,81 @@ +Qualcomm Always-On Subsystem side channel binding + +This binding describes the hardware component responsible for side channel +requests to the always-on subsystem (AOSS), used for certain power management +requests that is not handled by the standard RPMh interface. Each client in the +SoC has it's own block of message RAM and IRQ for communication with the AOSS. +The protocol used to communicate in the message RAM is known as Qualcomm +Messaging Protocol (QMP) + +The AOSS side channel exposes control over a set of resources, used to control +a set of debug related clocks and to affect the low power state of resources +related to the secondary subsystems. These resources are exposed as a set of +power-domains. + +- compatible: + Usage: required + Value type: + Definition: must be "qcom,sdm845-aoss-qmp" + +- reg: + Usage: required + Value type: + Definition: the base address and size of the message RAM for this + client's communication with the AOSS + +- interrupts: + Usage: required + Value type: + Definition: should specify the AOSS message IRQ for this client + +- mboxes: + Usage: required + Value type: + Definition: reference to the mailbox representing the outgoing doorbell + in APCS for this client, as described in mailbox/mailbox.txt + +- #clock-cells: + Usage: optional + Value type: + Definition: must be 0 + The single clock represents the QDSS clock. + +- #power-domain-cells: + Usage: optional + Value type: + Definition: must be 1 + The provided power-domains are: + CDSP state (0), LPASS state (1), modem state (2), SLPI + state (3), SPSS state (4) and Venus state (5). + += SUBNODES +The AOSS side channel also provides the controls for three cooling devices, +these are expressed as subnodes of the QMP node. The name of the node is used +to identify the resource and must therefor be "cx", "mx" or "ebi". + +- #cooling-cells: + Usage: optional + Value type: + Definition: must be 2 + += EXAMPLE + +The following example represents the AOSS side-channel message RAM and the +mechanism exposing the power-domains, as found in SDM845. + + aoss_qmp: qmp@c300000 { + compatible = "qcom,sdm845-aoss-qmp"; + reg = <0x0c300000 0x100000>; + interrupts = ; + mboxes = <&apss_shared 0>; + + #power-domain-cells = <1>; + + cx_cdev: cx { + #cooling-cells = <2>; + }; + + mx_cdev: mx { + #cooling-cells = <2>; + }; + }; -- cgit From adfd373820906d376c8b643f1a279ac809605b6b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 3 Jun 2019 08:53:35 +0200 Subject: iommu: Introduce IOMMU_RESV_DIRECT_RELAXABLE reserved memory regions Introduce a new type for reserved region. This corresponds to directly mapped regions which are known to be relaxable in some specific conditions, such as device assignment use case. Well known examples are those used by USB controllers providing PS/2 keyboard emulation for pre-boot BIOS and early BOOT or RMRRs associated to IGD working in legacy mode. Since commit c875d2c1b808 ("iommu/vt-d: Exclude devices using RMRRs from IOMMU API domains") and commit 18436afdc11a ("iommu/vt-d: Allow RMRR on graphics devices too"), those regions are currently considered "safe" with respect to device assignment use case which requires a non direct mapping at IOMMU physical level (RAM GPA -> HPA mapping). Those RMRRs currently exist and sometimes the device is attempting to access it but this has not been considered an issue until now. However at the moment, iommu_get_group_resv_regions() is not able to make any difference between directly mapped regions: those which must be absolutely enforced and those like above ones which are known as relaxable. This is a blocker for reporting severe conflicts between non relaxable RMRRs (like MSI doorbells) and guest GPA space. With this new reserved region type we will be able to use iommu_get_group_resv_regions() to enumerate the IOVA space that is usable through the IOMMU API without introducing regressions with respect to existing device assignment use cases (USB and IGD). Signed-off-by: Eric Auger Signed-off-by: Joerg Roedel --- Documentation/ABI/testing/sysfs-kernel-iommu_groups | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index 35c64e00b35c..017f5bc3920c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -24,3 +24,12 @@ Description: /sys/kernel/iommu_groups/reserved_regions list IOVA region is described on a single line: the 1st field is the base IOVA, the second is the end IOVA and the third field describes the type of the region. + +What: /sys/kernel/iommu_groups/reserved_regions +Date: June 2019 +KernelVersion: v5.3 +Contact: Eric Auger +Description: In case an RMRR is used only by graphics or USB devices + it is now exposed as "direct-relaxable" instead of "direct". + In device assignment use case, for instance, those RMRR + are considered to be relaxable and safe. -- cgit From 711486fd18596315d42cebaac3dba8c408f60a3d Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Thu, 6 Jun 2019 09:22:36 +0800 Subject: Documentation/filesystems/proc.txt: Add arch_status file Add documentation for /proc//arch_status file and the x86 specific AVX512_elapsed_ms entry in it. [ tglx: Massage changelog ] Signed-off-by: Aubrey Li Signed-off-by: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: peterz@infradead.org Cc: hpa@zytor.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: dave.hansen@intel.com Cc: arjan@linux.intel.com Cc: adobriyan@gmail.com Cc: aubrey.li@intel.com Cc: linux-api@vger.kernel.org Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Arjan van de Ven Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Linux API Link: https://lkml.kernel.org/r/20190606012236.9391-3-aubrey.li@linux.intel.com --- Documentation/filesystems/proc.txt | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 66cad5c86171..a226061fa109 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -45,6 +45,7 @@ Table of Contents 3.9 /proc//map_files - Information about memory mapped files 3.10 /proc//timerslack_ns - Task timerslack value 3.11 /proc//patch_state - Livepatch patch operation state + 3.12 /proc//arch_status - Task architecture specific information 4 Configuring procfs 4.1 Mount options @@ -1948,6 +1949,45 @@ patched. If the patch is being enabled, then the task has already been patched. If the patch is being disabled, then the task hasn't been unpatched yet. +3.12 /proc//arch_status - task architecture specific status +------------------------------------------------------------------- +When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the +architecture specific status of the task. + +Example +------- + $ cat /proc/6753/arch_status + AVX512_elapsed_ms: 8 + +Description +----------- + +x86 specific entries: +--------------------- + AVX512_elapsed_ms: + ------------------ + If AVX512 is supported on the machine, this entry shows the milliseconds + elapsed since the last time AVX512 usage was recorded. The recording + happens on a best effort basis when a task is scheduled out. This means + that the value depends on two factors: + + 1) The time which the task spent on the CPU without being scheduled + out. With CPU isolation and a single runnable task this can take + several seconds. + + 2) The time since the task was scheduled out last. Depending on the + reason for being scheduled out (time slice exhausted, syscall ...) + this can be arbitrary long time. + + As a consequence the value cannot be considered precise and authoritative + information. The application which uses this information has to be aware + of the overall scenario on the system in order to determine whether a + task is a real AVX512 user or not. Precise information can be obtained + with performance counters. + + A special value of '-1' indicates that no AVX512 usage was recorded, thus + the task is unlikely an AVX512 user, but depends on the workload and the + scheduling scenario, it also could be a false negative mentioned above. ------------------------------------------------------------------------------ Configuring procfs -- cgit From f40043b368ae787837ef659728f0b08bd34f5317 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 25 Apr 2019 17:34:42 +0200 Subject: dt-bindings: phy: tegra-xusb: List PLL power supplies These power supplies provide power for various PLLs that are set up and driven by the XUSB pad controller. These power supplies were previously improperly added to the PCIe and XUSB controllers, but depending on the driver probe order, power to the PLLs will not be supplied soon enough and cause initialization to fail. Reviewed-by: Rob Herring Signed-off-by: Thierry Reding Acked-by: Jon Hunter Signed-off-by: Kishon Vijay Abraham I --- .../devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt index daedb15f322e..9fb682e47c29 100644 --- a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt +++ b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt @@ -42,6 +42,18 @@ Required properties: - reset-names: Must include the following entries: - "padctl" +For Tegra124: +- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V. +- avdd-pll-erefe-supply: PLLE reference PLL power supply. Must supply 1.05 V. +- avdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V. +- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 3.3 V. + +For Tegra210: +- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V. +- avdd-pll-uerefe-supply: PLLE reference PLL power supply. Must supply 1.05 V. +- dvdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V. +- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 1.8 V. + For Tegra186: - avdd-pll-erefeut-supply: UPHY brick and reference clock as well as UTMI PHY power supply. Must supply 1.8 V. -- cgit From 0ab2c44def8f6cc637a5fb3ce9766d69d2c289d0 Mon Sep 17 00:00:00 2001 From: Volodymyr Babchuk Date: Thu, 23 May 2019 11:23:35 +0000 Subject: dt-bindings: arm: fix the document ID for SCMI protocol documentation arm,scmi.txt used the wrong document identifier. "ARM DUI 0922B" is the "ARM Compute Subsystem SCP, Message Interface Protocols". What we need is the ARM DEN 0056A - "ARM System Control and Management Interface Platform Design Document". Fixes: fe7be8b297b2 ("dt-bindings: arm: add support for ARM System Control and Management Interface(SCMI) protocol") Signed-off-by: Volodymyr Babchuk Signed-off-by: Sudeep Holla --- Documentation/devicetree/bindings/arm/arm,scmi.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/arm,scmi.txt b/Documentation/devicetree/bindings/arm/arm,scmi.txt index 5f3719ab7075..317a2fc3667a 100644 --- a/Documentation/devicetree/bindings/arm/arm,scmi.txt +++ b/Documentation/devicetree/bindings/arm/arm,scmi.txt @@ -6,7 +6,7 @@ that are provided by the hardware platform it is running on, including power and performance functions. This binding is intended to define the interface the firmware implementing -the SCMI as described in ARM document number ARM DUI 0922B ("ARM System Control +the SCMI as described in ARM document number ARM DEN 0056A ("ARM System Control and Management Interface Platform Design Document")[0] provide for OSPM in the device tree. -- cgit From 1d90dff62e16fee5bdc761c3230cc8a4c79814fc Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Tue, 11 Jun 2019 22:09:30 +0800 Subject: dt-bindings: pinctrl: add missing compatible string for V3s The pinctrl driver of V3s is already available and used in the kernel, but the compatible string of it is forgotten to be added. Add the missing compatible string. Signed-off-by: Icenowy Zheng Acked-by: Maxime Ripard Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt index cf96b7c20e4d..baba55db864c 100644 --- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt @@ -24,6 +24,7 @@ Required properties: "allwinner,sun8i-h3-pinctrl" "allwinner,sun8i-h3-r-pinctrl" "allwinner,sun8i-r40-pinctrl" + "allwinner,sun8i-v3s-pinctrl" "allwinner,sun50i-a64-pinctrl" "allwinner,sun50i-a64-r-pinctrl" "allwinner,sun50i-h5-pinctrl" -- cgit From 2e80e10f2d71f42c6c8f82fc173b21be856acc0f Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Tue, 11 Jun 2019 22:09:31 +0800 Subject: dt-bindings: pinctrl: add compatible string for Allwinner V3 pinctrl The Allwinner V3 SoC, despite come with the same die with V3s, has more GPIO pins than V3s, and a different compatible string for pinctrl is needed. Add the compatible string for V3 pinctrl. Signed-off-by: Icenowy Zheng Reviewed-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt index baba55db864c..328585c6da58 100644 --- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt @@ -24,6 +24,7 @@ Required properties: "allwinner,sun8i-h3-pinctrl" "allwinner,sun8i-h3-r-pinctrl" "allwinner,sun8i-r40-pinctrl" + "allwinner,sun8i-v3-pinctrl" "allwinner,sun8i-v3s-pinctrl" "allwinner,sun50i-a64-pinctrl" "allwinner,sun50i-a64-r-pinctrl" -- cgit From 6a80b30086b861b2591ba2a953042abd08c498e3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 10 Jun 2019 16:04:39 +0200 Subject: fmc: Delete the FMC subsystem The FMC subsystem was created in 2012 with the ambition to drive development of drivers for this hardware upstream. The current implementation has architectural flaws and would need to be revamped using real hardware to something that can reuse existing kernel abstractions in the subsystems for e.g. I2C, FPGA and GPIO. We have concluded that for the mainline kernel it will be better to delete the subsystem and start over with a clean slate when/if an active maintainer steps up. For details see: https://lkml.org/lkml/2018/10/29/534 Suggested-by: Federico Vaga Cc: Pat Riehecky Acked-by: Alessandro Rubini Signed-off-by: Federico Vaga Signed-off-by: Linus Walleij --- Documentation/fmc/API.txt | 47 ----- Documentation/fmc/FMC-and-SDB.txt | 88 ---------- Documentation/fmc/carrier.txt | 311 --------------------------------- Documentation/fmc/fmc-chardev.txt | 64 ------- Documentation/fmc/fmc-fakedev.txt | 36 ---- Documentation/fmc/fmc-trivial.txt | 17 -- Documentation/fmc/fmc-write-eeprom.txt | 98 ----------- Documentation/fmc/identifiers.txt | 168 ------------------ Documentation/fmc/mezzanine.txt | 123 ------------- Documentation/fmc/parameters.txt | 56 ------ 10 files changed, 1008 deletions(-) delete mode 100644 Documentation/fmc/API.txt delete mode 100644 Documentation/fmc/FMC-and-SDB.txt delete mode 100644 Documentation/fmc/carrier.txt delete mode 100644 Documentation/fmc/fmc-chardev.txt delete mode 100644 Documentation/fmc/fmc-fakedev.txt delete mode 100644 Documentation/fmc/fmc-trivial.txt delete mode 100644 Documentation/fmc/fmc-write-eeprom.txt delete mode 100644 Documentation/fmc/identifiers.txt delete mode 100644 Documentation/fmc/mezzanine.txt delete mode 100644 Documentation/fmc/parameters.txt (limited to 'Documentation') diff --git a/Documentation/fmc/API.txt b/Documentation/fmc/API.txt deleted file mode 100644 index 06b06b92c794..000000000000 --- a/Documentation/fmc/API.txt +++ /dev/null @@ -1,47 +0,0 @@ -Functions Exported by fmc.ko -**************************** - -The FMC core exports the usual 4 functions that are needed for a bus to -work, and a few more: - - int fmc_driver_register(struct fmc_driver *drv); - void fmc_driver_unregister(struct fmc_driver *drv); - int fmc_device_register(struct fmc_device *fmc); - void fmc_device_unregister(struct fmc_device *fmc); - - int fmc_device_register_n(struct fmc_device **fmc, int n); - void fmc_device_unregister_n(struct fmc_device **fmc, int n); - - uint32_t fmc_readl(struct fmc_device *fmc, int offset); - void fmc_writel(struct fmc_device *fmc, uint32_t val, int off); - void *fmc_get_drvdata(struct fmc_device *fmc); - void fmc_set_drvdata(struct fmc_device *fmc, void *data); - - int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw, - int sdb_entry); - -The data structure that describe a device is detailed in *note FMC -Device::, the one that describes a driver is detailed in *note FMC -Driver::. Please note that structures of type fmc_device must be -allocated by the caller, but must not be released after unregistering. -The fmc-bus itself takes care of releasing the structure when their use -count reaches zero - actually, the device model does that in lieu of us. - -The functions to register and unregister n devices are meant to be used -by carriers that host more than one mezzanine. The devices must all be -registered at the same time because if the FPGA is reprogrammed, all -devices in the array are affected. Usually, the driver matching the -first device will reprogram the FPGA, so other devices must know they -are already driven by a reprogrammed FPGA. - -If a carrier hosts slots that are driven by different FPGA devices, it -should register as a group only mezzanines that are driven by the same -FPGA, for the reason outlined above. - -Finally, the fmc_reprogram function calls the reprogram method (see -*note The API Offered by Carriers:: and also scans the memory area for -an SDB tree. You can pass -1 as sdb_entry to disable such scan. -Otherwise, the function fails if no tree is found at the specified -entry point. The function is meant to factorize common code, and by -the time you read this it is already used by the spec-sw and fine-delay -modules. diff --git a/Documentation/fmc/FMC-and-SDB.txt b/Documentation/fmc/FMC-and-SDB.txt deleted file mode 100644 index fa14e0b24521..000000000000 --- a/Documentation/fmc/FMC-and-SDB.txt +++ /dev/null @@ -1,88 +0,0 @@ - -FMC (FPGA Mezzanine Card) is the standard we use for our I/O devices, -in the context of White Rabbit and related hardware. - -In our I/O environments we need to write drivers for each mezzanine -card, and such drivers must work regardless of the carrier being used. -To achieve this, we abstract the FMC interface. - -We have a carrier for PCI-E called SPEC and one for VME called SVEC, -but more are planned. Also, we support stand-alone devices (usually -plugged on a SPEC card), controlled through Etherbone, developed by GSI. - -Code and documentation for the FMC bus was born as part of the spec-sw -project, but now it lives in its own project. Other projects, i.e. -software support for the various carriers, should include this as a -submodule. - -The most up to date version of code and documentation is always -available from the repository you can clone from: - - git://ohwr.org/fmc-projects/fmc-bus.git (read-only) - git@ohwr.org:fmc-projects/fmc-bus.git (read-write for developers) - -Selected versions of the documentation, as well as complete tar -archives for selected revisions are placed to the Files section of the -project: `http://www.ohwr.org/projects/fmc-bus/files' - - -What is FMC -*********** - -FMC, as said, stands for "FPGA Mezzanine Card". It is a standard -developed by the VME consortium called VITA (VMEbus International Trade -Association and ratified by ANSI, the American National Standard -Institute. The official documentation is called "ANSI-VITA 57.1". - -The FMC card is an almost square PCB, around 70x75 millimeters, that is -called mezzanine in this document. It usually lives plugged into -another PCB for power supply and control; such bigger circuit board is -called carrier from now on, and a single carrier may host more than one -mezzanine. - -In the typical application the mezzanine is mostly analog while the -carrier is mostly digital, and hosts an FPGA that must be configured to -match the specific mezzanine and the desired application. Thus, you may -need to load different FPGA images to drive different instances of the -same mezzanine. - -FMC, as such, is not a bus in the usual meaning of the term, because -most carriers have only one connector, and carriers with several -connectors have completely separate electrical connections to them. -This package, however, implements a bus as a software abstraction. - - -What is SDB -*********** - -SDB (Self Describing Bus) is a set of data structures that we use for -enumerating the internal structure of an FPGA image. We also use it as -a filesystem inside the FMC EEPROM. - -SDB is not mandatory for use of this FMC kernel bus, but if you have SDB -this package can make good use of it. SDB itself is developed in the -fpga-config-space OHWR project. The link to the repository is -`git://ohwr.org/hdl-core-lib/fpga-config-space.git' and what is used in -this project lives in the sdbfs subdirectory in there. - -SDB support for FMC is described in *note FMC Identification:: and -*note SDB Support:: - - -SDB Support -*********** - -The fmc.ko bus driver exports a few functions to help drivers taking -advantage of the SDB information that may be present in your own FPGA -memory image. - -The module exports the following functions, in the special header -. The linux/ prefix in the name is there because we -plan to submit it upstream in the future, and don't want to force -changes on our drivers if that happens. - - int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address); - void fmc_show_sdb_tree(struct fmc_device *fmc); - signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor, - uint32_t device, unsigned long *sz); - int fmc_free_sdb_tree(struct fmc_device *fmc); diff --git a/Documentation/fmc/carrier.txt b/Documentation/fmc/carrier.txt deleted file mode 100644 index 5e4f1dd3e98b..000000000000 --- a/Documentation/fmc/carrier.txt +++ /dev/null @@ -1,311 +0,0 @@ -FMC Device -********** - -Within the Linux bus framework, the FMC device is created and -registered by the carrier driver. For example, the PCI driver for the -SPEC card fills a data structure for each SPEC that it drives, and -registers an associated FMC device for each card. The SVEC driver can -do exactly the same for the VME carrier (actually, it should do it -twice, because the SVEC carries two FMC mezzanines). Similarly, an -Etherbone driver will be able to register its own FMC devices, offering -communication primitives through frame exchange. - -The contents of the EEPROM within the FMC are used for identification -purposes, i.e. for matching the device with its own driver. For this -reason the device structure includes a complete copy of the EEPROM -(actually, the carrier driver may choose whether or not to return it - -for example we most likely won't have the whole EEPROM available for -Etherbone devices. - -The following listing shows the current structure defining a device. -Please note that all the machinery is in place but some details may -still change in the future. For this reason, there is a version field -at the beginning of the structure. As usual, the minor number will -change for compatible changes (like a new flag) and the major number -will increase when an incompatible change happens (for example, a -change in layout of some fmc data structures). Device writers should -just set it to the value FMC_VERSION, and be ready to get back -EINVAL -at registration time. - - struct fmc_device { - unsigned long version; - unsigned long flags; - struct module *owner; /* char device must pin it */ - struct fmc_fru_id id; /* for EEPROM-based match */ - struct fmc_operations *op; /* carrier-provided */ - int irq; /* according to host bus. 0 == none */ - int eeprom_len; /* Usually 8kB, may be less */ - int eeprom_addr; /* 0x50, 0x52 etc */ - uint8_t *eeprom; /* Full contents or leading part */ - char *carrier_name; /* "SPEC" or similar, for special use */ - void *carrier_data; /* "struct spec *" or equivalent */ - __iomem void *fpga_base; /* May be NULL (Etherbone) */ - __iomem void *slot_base; /* Set by the driver */ - struct fmc_device **devarray; /* Allocated by the bus */ - int slot_id; /* Index in the slot array */ - int nr_slots; /* Number of slots in this carrier */ - unsigned long memlen; /* Used for the char device */ - struct device dev; /* For Linux use */ - struct device *hwdev; /* The underlying hardware device */ - unsigned long sdbfs_entry; - struct sdb_array *sdb; - uint32_t device_id; /* Filled by the device */ - char *mezzanine_name; /* Defaults to ``fmc'' */ - void *mezzanine_data; - }; - -The meaning of most fields is summarized in the code comment above. - -The following fields must be filled by the carrier driver before -registration: - - * version: must be set to FMC_VERSION. - - * owner: set to MODULE_OWNER. - - * op: the operations to act on the device. - - * irq: number for the mezzanine; may be zero. - - * eeprom_len: length of the following array. - - * eeprom_addr: 0x50 for first mezzanine and so on. - - * eeprom: the full content of the I2C EEPROM. - - * carrier_name. - - * carrier_data: a unique pointer for the carrier. - - * fpga_base: the I/O memory address (may be NULL). - - * slot_id: the index of this slot (starting from zero). - - * memlen: if fpga_base is valid, the length of I/O memory. - - * hwdev: to be used in some dev_err() calls. - - * device_id: a slot-specific unique integer number. - - -Please note that the carrier should read its own EEPROM memory before -registering the device, as well as fill all other fields listed above. - -The following fields should not be assigned, because they are filled -later by either the bus or the device driver: - - * flags. - - * fru_id: filled by the bus, parsing the eeprom. - - * slot_base: filled and used by the driver, if useful to it. - - * devarray: an array og all mezzanines driven by a singe FPGA. - - * nr_slots: set by the core at registration time. - - * dev: used by Linux. - - * sdb: FPGA contents, scanned according to driver's directions. - - * sdbfs_entry: SDB entry point in EEPROM: autodetected. - - * mezzanine_data: available for the driver. - - * mezzanine_name: filled by fmc-bus during identification. - - -Note: mezzanine_data may be redundant, because Linux offers the drvdata -approach, so the field may be removed in later versions of this bus -implementation. - -As I write this, she SPEC carrier is already completely functional in -the fmc-bus environment, and is a good reference to look at. - - -The API Offered by Carriers -=========================== - -The carrier provides a number of methods by means of the -`fmc_operations' structure, which currently is defined like this -(again, it is a moving target, please refer to the header rather than -this document): - - struct fmc_operations { - uint32_t (*readl)(struct fmc_device *fmc, int offset); - void (*writel)(struct fmc_device *fmc, uint32_t value, int offset); - int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw); - int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv); - int (*irq_request)(struct fmc_device *fmc, irq_handler_t h, - char *name, int flags); - void (*irq_ack)(struct fmc_device *fmc); - int (*irq_free)(struct fmc_device *fmc); - int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio, - int ngpio); - int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l); - int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l); - }; - -The individual methods perform the following tasks: - -`readl' -`writel' - These functions access FPGA registers by whatever means the - carrier offers. They are not expected to fail, and most of the time - they will just make a memory access to the host bus. If the - carrier provides a fpga_base pointer, the driver may use direct - access through that pointer. For this reason the header offers the - inline functions fmc_readl and fmc_writel that access fpga_base if - the respective method is NULL. A driver that wants to be portable - and efficient should use fmc_readl and fmc_writel. For Etherbone, - or other non-local carriers, error-management is still to be - defined. - -`validate' - Module parameters are used to manage different applications for - two or more boards of the same kind. Validation is based on the - busid module parameter, if provided, and returns the matching - index in the associated array. See *note Module Parameters:: in in - doubt. If no match is found, `-ENOENT' is returned; if the user - didn't pass `busid=', all devices will pass validation. The value - returned by the validate method can be used as index into other - parameters (for example, some drivers use the `lm32=' parameter in - this way). Such "generic parameters" are documented in *note - Module Parameters::, below. The validate method is used by - `fmc-trivial.ko', described in *note fmc-trivial::. - -`reprogram' - The carrier enumerates FMC devices by loading a standard (or - golden) FPGA binary that allows EEPROM access. Each driver, then, - will need to reprogram the FPGA by calling this function. If the - name argument is NULL, the carrier should reprogram the golden - binary. If the gateware name has been overridden through module - parameters (in a carrier-specific way) the file loaded will match - the parameters. Per-device gateware names can be specified using - the `gateware=' parameter, see *note Module Parameters::. Note: - Clients should call rhe new helper, fmc_reprogram, which both - calls this method and parse the SDB tree of the FPGA. - -`irq_request' -`irq_ack' -`irq_free' - Interrupt management is carrier-specific, so it is abstracted as - operations. The interrupt number is listed in the device - structure, and for the mezzanine driver the number is only - informative. The handler will receive the fmc pointer as dev_id; - the flags argument is passed to the Linux request_irq function, - but fmc-specific flags may be added in the future. You'll most - likely want to pass the `IRQF_SHARED' flag. - -`gpio_config' - The method allows to configure a GPIO pin in the carrier, and read - its current value if it is configured as input. See *note The GPIO - Abstraction:: for details. - -`read_ee' -`write_ee' - Read or write the EEPROM. The functions are expected to be only - called before reprogramming and the carrier should refuse them - with `ENODEV' after reprogramming. The offset is expected to be - within 8kB (the current size), but addresses up to 1MB are - reserved to fit bigger I2C devices in the future. Carriers may - offer access to other internal flash memories using these same - methods: for example the SPEC driver may define that its carrier - I2C memory is seen at offset 1M and the internal SPI flash is seen - at offset 16M. This multiplexing of several flash memories in the - same address space is carrier-specific and should only be used - by a driver that has verified the `carrier_name' field. - - - -The GPIO Abstraction -==================== - -Support for GPIO pins in the fmc-bus environment is not very -straightforward and deserves special discussion. - -While the general idea of a carrier-independent driver seems to fly, -configuration of specific signals within the carrier needs at least -some knowledge of the carrier itself. For this reason, the specific -driver can request to configure carrier-specific GPIO pins, numbered -from 0 to at most 4095. Configuration is performed by passing a -pointer to an array of struct fmc_gpio items, as well as the length of -the array. This is the data structure: - - struct fmc_gpio { - char *carrier_name; - int gpio; - int _gpio; /* internal use by the carrier */ - int mode; /* GPIOF_DIR_OUT etc, from */ - int irqmode; /* IRQF_TRIGGER_LOW and so on */ - }; - -By specifying a carrier_name for each pin, the driver may access -different pins in different carriers. The gpio_config method is -expected to return the number of pins successfully configured, ignoring -requests for other carriers. However, if no pin is configured (because -no structure at all refers to the current carrier_name), the operation -returns an error so the caller will know that it is running under a -yet-unsupported carrier. - -So, for example, a driver that has been developed and tested on both -the SPEC and the SVEC may request configuration of two different GPIO -pins, and expect one such configuration to succeed - if none succeeds -it most likely means that the current carrier is a still-unknown one. - -If, however, your GPIO pin has a specific known role, you can pass a -special number in the gpio field, using one of the following macros: - - #define FMC_GPIO_RAW(x) (x) /* 4096 of them */ - #define FMC_GPIO_IRQ(x) ((x) + 0x1000) /* 256 of them */ - #define FMC_GPIO_LED(x) ((x) + 0x1100) /* 256 of them */ - #define FMC_GPIO_KEY(x) ((x) + 0x1200) /* 256 of them */ - #define FMC_GPIO_TP(x) ((x) + 0x1300) /* 256 of them */ - #define FMC_GPIO_USER(x) ((x) + 0x1400) /* 256 of them */ - -Use of virtual GPIO numbers (anything but FMC_GPIO_RAW) is allowed -provided the carrier_name field in the data structure is left -unspecified (NULL). Each carrier is responsible for providing a mapping -between virtual and physical GPIO numbers. The carrier may then use the -_gpio field to cache the result of this mapping. - -All carriers must map their I/O lines to the sets above starting from -zero. The SPEC, for example, maps interrupt pins 0 and 1, and test -points 0 through 3 (even if the test points on the PCB are called -5,6,7,8). - -If, for example, a driver requires a free LED and a test point (for a -scope probe to be plugged at some point during development) it may ask -for FMC_GPIO_LED(0) and FMC_GPIO_TP(0). Each carrier will provide -suitable GPIO pins. Clearly, the person running the drivers will know -the order used by the specific carrier driver in assigning leds and -testpoints, so to make a carrier-dependent use of the diagnostic tools. - -In theory, some form of autodetection should be possible: a driver like -the wr-nic (which uses IRQ(1) on the SPEC card) should configure -IRQ(0), make a test with software-generated interrupts and configure -IRQ(1) if the test fails. This probing step should be used because even -if the wr-nic gateware is known to use IRQ1 on the SPEC, the driver -should be carrier-independent and thus use IRQ(0) as a first bet - -actually, the knowledge that IRQ0 may fail is carrier-dependent -information, but using it doesn't make the driver unsuitable for other -carriers. - -The return value of gpio_config is defined as follows: - - * If no pin in the array can be used by the carrier, `-ENODEV'. - - * If at least one virtual GPIO number cannot be mapped, `-ENOENT'. - - * On success, 0 or positive. The value returned is the number of - high input bits (if no input is configured, the value for success - is 0). - -While I admit the procedure is not completely straightforward, it -allows configuration, input and output with a single carrier operation. -Given the typical use case of FMC devices, GPIO operations are not -expected to ever by in hot paths, and GPIO access so fare has only been -used to configure the interrupt pin, mode and polarity. Especially -reading inputs is not expected to be common. If your device has GPIO -capabilities in the hot path, you should consider using the kernel's -GPIO mechanisms. diff --git a/Documentation/fmc/fmc-chardev.txt b/Documentation/fmc/fmc-chardev.txt deleted file mode 100644 index d9ccb278e597..000000000000 --- a/Documentation/fmc/fmc-chardev.txt +++ /dev/null @@ -1,64 +0,0 @@ -fmc-chardev -=========== - -This is a simple generic driver, that allows user access by means of a -character device (actually, one for each mezzanine it takes hold of). - -The char device is created as a misc device. Its name in /dev (as -created by udev) is the same name as the underlying FMC device. Thus, -the name can be a silly fmc-0000 look-alike if the device has no -identifiers nor bus_id, a more specific fmc-0400 if the device has a -bus-specific address but no associated name, or something like -fdelay-0400 if the FMC core can rely on both a mezzanine name and a bus -address. - -Currently the driver only supports read and write: you can lseek to the -desired address and read or write a register. - -The driver assumes all registers are 32-bit in size, and only accepts a -single read or write per system call. However, as a result of Unix read -and write semantics, users can simply fread or fwrite bigger areas in -order to dump or store bigger memory areas. - -There is currently no support for mmap, user-space interrupt management -and DMA buffers. They may be added in later versions, if the need -arises. - -The example below shows raw access to a SPEC card programmed with its -golden FPGA file, that features an SDB structure at offset 256 - i.e. -64 words. The mezzanine's EEPROM in this case is not programmed, so the -default name is fmc-, and there are two cards in the system: - - spusa.root# insmod fmc-chardev.ko - [ 1073.339332] spec 0000:02:00.0: Driver has no ID: matches all - [ 1073.345051] spec 0000:02:00.0: Created misc device "fmc-0200" - [ 1073.350821] spec 0000:04:00.0: Driver has no ID: matches all - [ 1073.356525] spec 0000:04:00.0: Created misc device "fmc-0400" - spusa.root# ls -l /dev/fmc* - crw------- 1 root root 10, 58 Nov 20 19:23 /dev/fmc-0200 - crw------- 1 root root 10, 57 Nov 20 19:23 /dev/fmc-0400 - spusa.root# dd bs=4 skip=64 count=1 if=/dev/fmc-0200 2> /dev/null | od -t x1z - 0000000 2d 42 44 53 >-BDS< - 0000004 - -The simple program tools/fmc-mem in this package can access an FMC char -device and read or write a word or a whole area. Actually, the program -is not specific to FMC at all, it just uses lseek, read and write. - -Its first argument is the device name, the second the offset, the third -(if any) the value to write and the optional last argument that must -begin with "+" is the number of bytes to read or write. In case of -repeated reading data is written to stdout; repeated writes read from -stdin and the value argument is ignored. - -The following examples show reading the SDB magic number and the first -SDB record from a SPEC device programmed with its golden image: - - spusa.root# ./fmc-mem /dev/fmc-0200 100 - 5344422d - spusa.root# ./fmc-mem /dev/fmc-0200 100 +40 | od -Ax -t x1z - 000000 2d 42 44 53 00 01 02 00 00 00 00 00 00 00 00 00 >-BDS............< - 000010 00 00 00 00 ff 01 00 00 00 00 00 00 51 06 00 00 >............Q...< - 000020 c9 42 a5 e6 02 00 00 00 11 05 12 20 2d 34 42 57 >.B......... -4BW< - 000030 73 6f 72 43 72 61 62 73 49 53 47 2d 00 20 20 20 >sorCrabsISG-. < - 000040 diff --git a/Documentation/fmc/fmc-fakedev.txt b/Documentation/fmc/fmc-fakedev.txt deleted file mode 100644 index e85b74a4ae30..000000000000 --- a/Documentation/fmc/fmc-fakedev.txt +++ /dev/null @@ -1,36 +0,0 @@ -fmc-fakedev -=========== - -This package includes a software-only device, called fmc-fakedev, which -is able to register up to 4 mezzanines (by default it registers one). -Unlike the SPEC driver, which creates an FMC device for each PCI cards -it manages, this module creates a single instance of its set of -mezzanines. - -It is meant as the simplest possible example of how a driver should be -written, and it includes a fake EEPROM image (built using the tools -described in *note FMC Identification::),, which by default is -replicated for each fake mezzanine. - -You can also use this device to verify the match algorithms, by asking -it to test your own EEPROM image. You can provide the image by means of -the eeprom= module parameter: the new EEPROM image is loaded, as usual, -by means of the firmware loader. This example shows the defaults and a -custom EEPROM image: - - spusa.root# insmod fmc-fakedev.ko - [ 99.971247] fake-fmc-carrier: mezzanine 0 - [ 99.975393] Manufacturer: fake-vendor - [ 99.979624] Product name: fake-design-for-testing - spusa.root# rmmod fmc-fakedev - spusa.root# insmod fmc-fakedev.ko eeprom=fdelay-eeprom.bin - [ 121.447464] fake-fmc-carrier: Mezzanine 0: eeprom "fdelay-eeprom.bin" - [ 121.462725] fake-fmc-carrier: mezzanine 0 - [ 121.466858] Manufacturer: CERN - [ 121.470477] Product name: FmcDelay1ns4cha - spusa.root# rmmod fmc-fakedev - -After loading the device, you can use the write_ee method do modify its -own internal fake EEPROM: whenever the image is overwritten starting at -offset 0, the module will unregister and register again the FMC device. -This is shown in fmc-write-eeprom.txt diff --git a/Documentation/fmc/fmc-trivial.txt b/Documentation/fmc/fmc-trivial.txt deleted file mode 100644 index d1910bc67159..000000000000 --- a/Documentation/fmc/fmc-trivial.txt +++ /dev/null @@ -1,17 +0,0 @@ -fmc-trivial -=========== - -The simple module fmc-trivial is just a simple client that registers an -interrupt handler. I used it to verify the basic mechanism of the FMC -bus and how interrupts worked. - -The module implements the generic FMC parameters, so it can program a -different gateware file in each card. The whole list of parameters it -accepts are: - -`busid=' -`gateware=' - Generic parameters. See mezzanine.txt - - -This driver is worth reading, in my opinion. diff --git a/Documentation/fmc/fmc-write-eeprom.txt b/Documentation/fmc/fmc-write-eeprom.txt deleted file mode 100644 index e0a9712156aa..000000000000 --- a/Documentation/fmc/fmc-write-eeprom.txt +++ /dev/null @@ -1,98 +0,0 @@ -fmc-write-eeprom -================ - -This module is designed to load a binary file from /lib/firmware and to -write it to the internal EEPROM of the mezzanine card. This driver uses -the `busid' generic parameter. - -Overwriting the EEPROM is not something you should do daily, and it is -expected to only happen during manufacturing. For this reason, the -module makes it unlikely for the random user to change a working EEPROM. - -However, since the EEPROM may include application-specific information -other than the identification, later versions of this packages added -write-support through sysfs. See *note Accessing the EEPROM::. - -To avoid damaging the EEPROM content, the module takes the following -measures: - - * It accepts a `file=' argument (within /lib/firmware) and if no - such argument is received, it doesn't write anything to EEPROM - (i.e. there is no default file name). - - * If the file name ends with `.bin' it is written verbatim starting - at offset 0. - - * If the file name ends with `.tlv' it is interpreted as - type-length-value (i.e., it allows writev(2)-like operation). - - * If the file name doesn't match any of the patterns above, it is - ignored and no write is performed. - - * Only cards listed with `busid=' are written to. If no busid is - specified, no programming is done (and the probe function of the - driver will fail). - - -Each TLV tuple is formatted in this way: the header is 5 bytes, -followed by data. The first byte is `w' for write, the next two bytes -represent the address, in little-endian byte order, and the next two -represent the data length, in little-endian order. The length does not -include the header (it is the actual number of bytes to be written). - -This is a real example: that writes 5 bytes at position 0x110: - - spusa.root# od -t x1 -Ax /lib/firmware/try.tlv - 000000 77 10 01 05 00 30 31 32 33 34 - 00000a - spusa.root# insmod /tmp/fmc-write-eeprom.ko busid=0x0200 file=try.tlv - [19983.391498] spec 0000:03:00.0: write 5 bytes at 0x0110 - [19983.414615] spec 0000:03:00.0: write_eeprom: success - -Please note that you'll most likely want to use SDBFS to build your -EEPROM image, at least if your mezzanines are being used in the White -Rabbit environment. For this reason the TLV format is not expected to -be used much and is not expected to be developed further. - -If you want to try reflashing fake EEPROM devices, you can use the -fmc-fakedev.ko module (see *note fmc-fakedev::). Whenever you change -the image starting at offset 0, it will deregister and register again -after two seconds. Please note, however, that if fmc-write-eeprom is -still loaded, the system will associate it to the new device, which -will be reprogrammed and thus will be unloaded after two seconds. The -following example removes the module after it reflashed fakedev the -first time. - - spusa.root# insmod fmc-fakedev.ko - [ 72.984733] fake-fmc: Manufacturer: fake-vendor - [ 72.989434] fake-fmc: Product name: fake-design-for-testing - spusa.root# insmod fmc-write-eeprom.ko busid=0 file=fdelay-eeprom.bin; \ - rmmod fmc-write-eeprom - [ 130.874098] fake-fmc: Matching a generic driver (no ID) - [ 130.887845] fake-fmc: programming 6155 bytes - [ 130.894567] fake-fmc: write_eeprom: success - [ 132.895794] fake-fmc: Manufacturer: CERN - [ 132.899872] fake-fmc: Product name: FmcDelay1ns4cha - - -Accessing the EEPROM -===================== - -The bus creates a sysfs binary file called eeprom for each mezzanine it -knows about: - - spusa.root# cd /sys/bus/fmc/devices; ls -l */eeprom - -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcAdc100m14b4cha-0800/eeprom - -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDelay1ns4cha-0200/eeprom - -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDio5cha-0400/eeprom - -Everybody can read the files and the superuser can also modify it, but -the operation may on the carrier driver, if the carrier is unable to -access the I2C bus. For example, the spec driver can access the bus -only with its golden gateware: after a mezzanine driver reprogrammed -the FPGA with a custom circuit, the carrier is unable to access the -EEPROM and returns ENOTSUPP. - -An alternative way to write the EEPROM is the mezzanine driver -fmc-write-eeprom (See *note fmc-write-eeprom::), but the procedure is -more complex. diff --git a/Documentation/fmc/identifiers.txt b/Documentation/fmc/identifiers.txt deleted file mode 100644 index 3bb577ff0d52..000000000000 --- a/Documentation/fmc/identifiers.txt +++ /dev/null @@ -1,168 +0,0 @@ -FMC Identification -****************** - -The FMC standard requires every compliant mezzanine to carry -identification information in an I2C EEPROM. The information must be -laid out according to the "IPMI Platform Management FRU Information", -where IPMI is a lie I'd better not expand, and FRU means "Field -Replaceable Unit". - -The FRU information is an intricate unreadable binary blob that must -live at offset 0 of the EEPROM, and typically extends for a few hundred -bytes. The standard allows the application to use all the remaining -storage area of the EEPROM as it wants. - -This chapter explains how to create your own EEPROM image and how to -write it in your mezzanine, as well as how devices and drivers are -paired at run time. EEPROM programming uses tools that are part of this -package and SDB (part of the fpga-config-space package). - -The first sections are only interesting for manufacturers who need to -write the EEPROM. If you are just a software developer writing an FMC -device or driver, you may jump straight to *note SDB Support::. - - -Building the FRU Structure -========================== - -If you want to know the internals of the FRU structure and despair, you -can retrieve the document from -`http://download.intel.com/design/servers/ipmi/FRU1011.pdf' . The -standard is awful and difficult without reason, so we only support the -minimum mandatory subset - we create a simple structure and parse it -back at run time, but we are not able to either generate or parse more -arcane features like non-english languages and 6-bit text. If you need -more items of the FRU standard for your boards, please submit patches. - -This package includes the Python script that Matthieu Cattin wrote to -generate the FRU binary blob, based on an helper libipmi by Manohar -Vanga and Matthieu himself. I changed the test script to receive -parameters from the command line or from the environment (the command -line takes precedence) - -To make a long story short, in order to build a standard-compliant -binary file to be burned in your EEPROM, you need the following items: - - Environment Opt Official Name Default ---------------------------------------------------------------------- - FRU_VENDOR -v "Board Manufacturer" fmc-example - FRU_NAME -n "Board Product Name" mezzanine - FRU_SERIAL -s `Board Serial Number" 0001 - FRU_PART -p "Board Part Number" sample-part - FRU_OUTPUT -o not applicable /dev/stdout - -The "Official Name" above is what you find in the FRU official -documentation, chapter 11, page 7 ("Board Info Area Format"). The -output option is used to save the generated binary to a specific file -name instead of stdout. - -You can pass the items to the FRU generator either in the environment -or on the command line. This package has currently no support for -specifying power consumption or such stuff, but I plan to add it as -soon as I find some time for that. - -FIXME: consumption etc for FRU are here or in PTS? - -The following example creates a binary image for a specific board: - - ./tools/fru-generator -v CERN -n FmcAdc100m14b4cha \ - -s HCCFFIA___-CR000003 -p EDA-02063-V5-0 > eeprom.bin - -The following example shows a script that builds several binary EEPROM -images for a series of boards, changing the serial number for each of -them. The script uses a mix of environment variables and command line -options, and uses the same string patterns shown above. - - #!/bin/sh - - export FRU_VENDOR="CERN" - export FRU_NAME="FmcAdc100m14b4cha" - export FRU_PART="EDA-02063-V5-0" - - serial="HCCFFIA___-CR" - - for number in $(seq 1 50); do - # build number-string "ns" - ns="$(printf %06d $number)" - ./fru-generator -s "${serial}${ns}" > eeprom-${ns}.bin - done - - -Using SDB-FS in the EEPROM -========================== - -If you want to use SDB as a filesystem in the EEPROM device within the -mezzanine, you should create one such filesystem using gensdbfs, from -the fpga-config-space package on OHWR. - -By using an SBD filesystem you can cluster several files in a single -EEPROM, so both the host system and a soft-core running in the FPGA (if -any) can access extra production-time information. - -We chose to use SDB as a storage filesystem because the format is very -simple, and both the host system and the soft-core will likely already -include support code for such format. The SDB library offered by the -fpga-config-space is less than 1kB under LM32, so it proves quite up to -the task. - -The SDB entry point (which acts as a directory listing) cannot live at -offset zero in the flash device, because the FRU information must live -there. To avoid wasting precious storage space while still allowing -for more-than-minimal FRU structures, the fmc.ko will look for the SDB -record at address 256, 512 and 1024. - -In order to generate the complete EEPROM image you'll need a -configuration file for gensdbfs: you tell the program where to place -the sdb entry point, and you must force the FRU data file to be placed -at the beginning of the storage device. If needed, you can also place -other files at a special offset (we sometimes do it for backward -compatibility with drivers we wrote before implementing SDB for flash -memory). - -The directory tools/sdbfs of this package includes a well-commented -example that you may want to use as a starting point (the comments are -in the file called -SDB-CONFIG-). Reading documentation for gensdbfs -is a suggested first step anyways. - -This package (generic FMC bus support) only accesses two files in the -EEPROM: the FRU information, at offset zero, with a suggested filename -of IPMI-FRU and the short name for the mezzanine, in a file called -name. The IPMI-FRU name is not mandatory, but a strongly suggested -choice; the name filename is mandatory, because this is the preferred -short name used by the FMC core. For example, a name of "fdelay" may -supplement a Product Name like "FmcDelay1ns4cha" - exactly as -demonstrated in `tools/sdbfs'. - -Note: SDB access to flash memory is not yet supported, so the short -name currently in use is just the "Product Name" FRU string. - -The example in tools/sdbfs includes an extra file, that is needed by -the fine-delay driver, and must live at a known address of 0x1800. By -running gensdbfs on that directory you can output your binary EEPROM -image (here below spusa$ is the shell prompt): - - spusa$ ../fru-generator -v CERN -n FmcDelay1ns4cha -s proto-0 \ - -p EDA-02267-V3 > IPMI-FRU - spusa$ ls -l - total 16 - -rw-rw-r-- 1 rubini staff 975 Nov 19 18:08 --SDB-CONFIG-- - -rw-rw-r-- 1 rubini staff 216 Nov 19 18:13 IPMI-FRU - -rw-rw-r-- 1 rubini staff 11 Nov 19 18:04 fd-calib - -rw-rw-r-- 1 rubini staff 7 Nov 19 18:04 name - spusa$ sudo gensdbfs . /lib/firmware/fdelay-eeprom.bin - spusa$ sdb-read -l -e 0x100 /lib/firmware/fdelay-eeprom.bin - /home/rubini/wip/sdbfs/userspace/sdb-read: listing format is to be defined - 46696c6544617461:2e202020 00000100-000018ff . - 46696c6544617461:6e616d65 00000200-00000206 name - 46696c6544617461:66642d63 00001800-000018ff fd-calib - 46696c6544617461:49504d49 00000000-000000d7 IPMI-FRU - spusa$ ../fru-dump /lib/firmware/fdelay-eeprom.bin - /lib/firmware/fdelay-eeprom.bin: manufacturer: CERN - /lib/firmware/fdelay-eeprom.bin: product-name: FmcDelay1ns4cha - /lib/firmware/fdelay-eeprom.bin: serial-number: proto-0 - /lib/firmware/fdelay-eeprom.bin: part-number: EDA-02267-V3 - -As expected, the output file is both a proper sdbfs object and an IPMI -FRU information blob. The fd-calib file lives at offset 0x1800 and is -over-allocated to 256 bytes, according to the configuration file for -gensdbfs. diff --git a/Documentation/fmc/mezzanine.txt b/Documentation/fmc/mezzanine.txt deleted file mode 100644 index 87910dbfc91e..000000000000 --- a/Documentation/fmc/mezzanine.txt +++ /dev/null @@ -1,123 +0,0 @@ -FMC Driver -********** - -An FMC driver is concerned with the specific mezzanine and associated -gateware. As such, it is expected to be independent of the carrier -being used: it will perform I/O accesses only by means of -carrier-provided functions. - -The matching between device and driver is based on the content of the -EEPROM (as mandated by the FMC standard) or by the actual cores -configured in the FPGA; the latter technique is used when the FPGA is -already programmed when the device is registered to the bus core. - -In some special cases it is possible for a driver to directly access -FPGA registers, by means of the `fpga_base' field of the device -structure. This may be needed for high-bandwidth peripherals like fast -ADC cards. If the device module registered a remote device (for example -by means of Etherbone), the `fpga_base' pointer will be NULL. -Therefore, drivers must be ready to deal with NULL base pointers, and -fail gracefully. Most driver, however, are not expected to access the -pointer directly but run fmc_readl and fmc_writel instead, which will -work in any case. - -In even more special cases, the driver may access carrier-specific -functionality: the `carrier_name' string allows the driver to check -which is the current carrier and make use of the `carrier_data' -pointer. We chose to use carrier names rather than numeric identifiers -for greater flexibility, but also to avoid a central registry within -the `fmc.h' file - we hope other users will exploit our framework with -their own carriers. An example use of carrier names is in GPIO setup -(see *note The GPIO Abstraction::), although the name match is not -expected to be performed by the driver. If you depend on specific -carriers, please check the carrier name and fail gracefully if your -driver finds it is running in a yet-unknown-to-it environment. - - -ID Table -======== - -Like most other Linux drivers, and FMC driver must list all the devices -which it is able to drive. This is usually done by means of a device -table, but in FMC we can match hardware based either on the contents of -their EEPROM or on the actual FPGA cores that can be enumerated. -Therefore, we have two tables of identifiers. - -Matching of FRU information depends on two names, the manufacturer (or -vendor) and the device (see *note FMC Identification::); for -flexibility during production (i.e. before writing to the EEPROM) the -bus supports a catch-all driver that specifies NULL strings. For this -reason, the table is specified as pointer-and-length, not a a -null-terminated array - the entry with NULL names can be a valid entry. - -Matching on FPGA cores depends on two numeric fields: the 64-bit vendor -number and the 32-bit device number. Support for matching based on -class is not yet implemented. Each device is expected to be uniquely -identified by an array of cores (it matches if all of the cores are -instantiated), and for consistency the list is passed as -pointer-and-length. Several similar devices can be driven by the same -driver, and thus the driver specifies and array of such arrays. - -The complete set of involved data structures is thus the following: - - struct fmc_fru_id { char *manufacturer; char *product_name; }; - struct fmc_sdb_one_id { uint64_t vendor; uint32_t device; }; - struct fmc_sdb_id { struct fmc_sdb_one_id *cores; int cores_nr; }; - - struct fmc_device_id { - struct fmc_fru_id *fru_id; int fru_id_nr; - struct fmc_sdb_id *sdb_id; int sdb_id_nr; - }; - -A better reference, with full explanation, is the header. - - -Module Parameters -================= - -Most of the FMC drivers need the same set of kernel parameters. This -package includes support to implement common parameters by means of -fields in the `fmc_driver' structure and simple macro definitions. - -The parameters are carrier-specific, in that they rely on the busid -concept, that varies among carriers. For the SPEC, the identifier is a -PCI bus and devfn number, 16 bits wide in total; drivers for other -carriers will most likely offer something similar but not identical, -and some code duplication is unavoidable. - -This is the list of parameters that are common to several modules to -see how they are actually used, please look at spec-trivial.c. - -`busid=' - This is an array of integers, listing carrier-specific - identification numbers. For PIC, for example, `0x0400' represents - bus 4, slot 0. If any such ID is specified, the driver will only - accept to drive cards that appear in the list (even if the FMC ID - matches). This is accomplished by the validate carrier method. - -`gateware=' - The argument is an array of strings. If no busid= is specified, - the first string of gateware= is used for all cards; otherwise the - identifiers and gateware names are paired one by one, in the order - specified. - -`show_sdb=' - For modules supporting it, this parameter asks to show the SDB - internal structure by means of kernel messages. It is disabled by - default because those lines tend to hide more important messages, - if you look at the system console while loading the drivers. - Note: the parameter is being obsoleted, because fmc.ko itself now - supports dump_sdb= that applies to every client driver. - - -For example, if you are using the trivial driver to load two different -gateware files to two different cards, you can use the following -parameters to load different binaries to the cards, after looking up -the PCI identifiers. This has been tested with a SPEC carrier. - - insmod fmc-trivial.ko \ - busid=0x0200,0x0400 \ - gateware=fmc/fine-delay.bin,fmc/simple-dio.bin - -Please note that not all sub-modules support all of those parameters. -You can use modinfo to check what is supported by each module. diff --git a/Documentation/fmc/parameters.txt b/Documentation/fmc/parameters.txt deleted file mode 100644 index 59edf088e3a4..000000000000 --- a/Documentation/fmc/parameters.txt +++ /dev/null @@ -1,56 +0,0 @@ -Module Parameters in fmc.ko -*************************** - -The core driver receives two module parameters, meant to help debugging -client modules. Both parameters can be modified by writing to -/sys/module/fmc/parameters/, because they are used when client drivers -are devices are registered, not when fmc.ko is loaded. - -`dump_eeprom=' - If not zero, the parameter asks the bus controller to dump the - EEPROM of any device that is registered, using printk. - -`dump_sdb=' - If not zero, the parameter prints the SDB tree of every FPGA it is - loaded by fmc_reprogram(). If greater than one, it asks to dump - the binary content of SDB records. This currently only dumps the - top-level SDB array, though. - - -EEPROM dumping avoids repeating lines, since most of the contents is -usually empty and all bits are one or zero. This is an example of the -output: - - [ 6625.850480] spec 0000:02:00.0: FPGA programming successful - [ 6626.139949] spec 0000:02:00.0: Manufacturer: CERN - [ 6626.144666] spec 0000:02:00.0: Product name: FmcDelay1ns4cha - [ 6626.150370] FMC: mezzanine 0: 0000:02:00.0 on SPEC - [ 6626.155179] FMC: dumping eeprom 0x2000 (8192) bytes - [ 6626.160087] 0000: 01 00 00 01 00 0b 00 f3 01 0a 00 a5 85 87 c4 43 - [ 6626.167069] 0010: 45 52 4e cf 46 6d 63 44 65 6c 61 79 31 6e 73 34 - [ 6626.174019] 0020: 63 68 61 c7 70 72 6f 74 6f 2d 30 cc 45 44 41 2d - [ 6626.180975] 0030: 30 32 32 36 37 2d 56 33 da 32 30 31 32 2d 31 31 - [...] - [ 6626.371366] 0200: 66 64 65 6c 61 79 0a 00 00 00 00 00 00 00 00 00 - [ 6626.378359] 0210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - [ 6626.385361] [...] - [ 6626.387308] 1800: 70 6c 61 63 65 68 6f 6c 64 65 72 ff ff ff ff ff - [ 6626.394259] 1810: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff - [ 6626.401250] [...] - -The dump of SDB looks like the following; the example shows the simple -golden gateware for the SPEC card, removing the leading timestamps to -fit the page: - - spec 0000:02:00.0: SDB: 00000651:e6a542c9 WB4-Crossbar-GSI - spec 0000:02:00.0: SDB: 0000ce42:ff07fc47 WR-Periph-Syscon (00000000-000000ff) - FMC: mezzanine 0: 0000:02:00.0 on SPEC - FMC: poor dump of sdb first level: - 0000: 53 44 42 2d 00 02 01 00 00 00 00 00 00 00 00 00 - 0010: 00 00 00 00 00 00 01 ff 00 00 00 00 00 00 06 51 - 0020: e6 a5 42 c9 00 00 00 02 20 12 05 11 57 42 34 2d - 0030: 43 72 6f 73 73 62 61 72 2d 47 53 49 20 20 20 00 - 0040: 00 00 01 01 00 00 00 07 00 00 00 00 00 00 00 00 - 0050: 00 00 00 00 00 00 00 ff 00 00 00 00 00 00 ce 42 - 0060: ff 07 fc 47 00 00 00 01 20 12 03 05 57 52 2d 50 - 0070: 65 72 69 70 68 2d 53 79 73 63 6f 6e 20 20 20 01 -- cgit From b7e47f48f1197c24f5347327afc2a4f7f6da9ca5 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 11 Jun 2019 15:29:40 +0300 Subject: bindings: sound: davinci-mcasp: Add support for optional auxclk-fs-ratio When McASP is bus master it's reference clock (AUXCLK) might not be a static clock, but running at a specific FS ratio. Signed-off-by: Peter Ujfalusi Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt index a58f79f5345c..c483dcec01f8 100644 --- a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt +++ b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt @@ -44,6 +44,9 @@ Optional properties: please refer to pinctrl-bindings.txt - fck_parent : Should contain a valid clock name which will be used as parent for the McASP fck +- auxclk-fs-ratio: When McASP is bus master indicates the ratio between AUCLK + and FS rate if applicable: + AUCLK rate = auxclk-fs-ratio * FS rate Optional GPIO support: If any McASP pin need to be used as GPIO then the McASP node must have: -- cgit From 0114214eca21e85d66a5e206f803db42d1d07960 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 12 Jun 2019 14:25:27 +0200 Subject: dt-bindings: net: wiznet: add w5x00 support Add bindings for Wiznet's w5x00 series of SPI interfaced Ethernet chips. Based on the bindings for microchip,enc28j60. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: David S. Miller --- .../devicetree/bindings/net/wiznet,w5x00.txt | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/wiznet,w5x00.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/wiznet,w5x00.txt b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt new file mode 100644 index 000000000000..e9665798c4be --- /dev/null +++ b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt @@ -0,0 +1,50 @@ +* Wiznet w5x00 + +This is a standalone 10/100 MBit Ethernet controller with SPI interface. + +For each device connected to a SPI bus, define a child node within +the SPI master node. + +Required properties: +- compatible: Should be one of the following strings: + "wiznet,w5100" + "wiznet,w5200" + "wiznet,w5500" +- reg: Specify the SPI chip select the chip is wired to. +- interrupts: Specify the interrupt index within the interrupt controller (referred + to above in interrupt-parent) and interrupt type. w5x00 natively + generates falling edge interrupts, however, additional board logic + might invert the signal. +- pinctrl-names: List of assigned state names, see pinctrl binding documentation. +- pinctrl-0: List of phandles to configure the GPIO pin used as interrupt line, + see also generic and your platform specific pinctrl binding + documentation. + +Optional properties: +- spi-max-frequency: Maximum frequency of the SPI bus when accessing the w5500. + According to the w5500 datasheet, the chip allows a maximum of 80 MHz, however, + board designs may need to limit this value. +- local-mac-address: See ethernet.txt in the same directory. + + +Example (for Raspberry Pi with pin control stuff for GPIO irq): + +&spi { + ethernet@0: w5500@0 { + compatible = "wiznet,w5500"; + reg = <0>; + pinctrl-names = "default"; + pinctrl-0 = <ð1_pins>; + interrupt-parent = <&gpio>; + interrupts = <25 IRQ_TYPE_EDGE_FALLING>; + spi-max-frequency = <30000000>; + }; +}; + +&gpio { + eth1_pins: eth1_pins { + brcm,pins = <25>; + brcm,function = <0>; /* in */ + brcm,pull = <0>; /* none */ + }; +}; -- cgit From bb2e05e0c8dcc2c93a97410ce362da998b83155f Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 12 Jun 2019 21:29:34 +0900 Subject: linux-next: DOC: RDS: Fix a typo in rds.txt This patch fixes a spelling typo in rds.txt Signed-off-by: Masanari Iida Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- Documentation/networking/rds.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt index 0235ae69af2a..f2a0147c933d 100644 --- a/Documentation/networking/rds.txt +++ b/Documentation/networking/rds.txt @@ -389,7 +389,7 @@ Multipath RDS (mprds) a common (to all paths) part, and a per-path struct rds_conn_path. All I/O workqs and reconnect threads are driven from the rds_conn_path. Transports such as TCP that are multipath capable may then set up a - TPC socket per rds_conn_path, and this is managed by the transport via + TCP socket per rds_conn_path, and this is managed by the transport via the transport privatee cp_transport_data pointer. Transports announce themselves as multipath capable by setting the -- cgit From 03d66cfa2ad62cd0bb4c9aabc5a28d56af41f0cb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Jun 2019 22:44:08 -0700 Subject: crypto: doc - improve the skcipher API example code Rewrite the skcipher API example, changing it to encrypt a buffer with AES-256-XTS. This addresses various problems with the previous example: - It requests a specific driver "cbc-aes-aesni", which is unusual. Normally users ask for "cbc(aes)", not a specific driver. - It encrypts only a single AES block. For the reader, that doesn't clearly distinguish the "skcipher" API from the "cipher" API. - Showing how to encrypt something with bare CBC is arguably a poor choice of example, as it doesn't follow modern crypto trends. Now, usually authenticated encryption is recommended, in which case the user would use the AEAD API, not skcipher. Disk encryption is still a legitimate use for skcipher, but for that usually XTS is recommended. - Many other bugs and poor coding practices, such as not setting CRYPTO_TFM_REQ_MAY_SLEEP, unnecessarily allocating a heap buffer for the IV, unnecessary NULL checks, using a pointless wrapper struct, and forgetting to set an error code in one case. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- Documentation/crypto/api-samples.rst | 176 +++++++++++++++-------------------- 1 file changed, 77 insertions(+), 99 deletions(-) (limited to 'Documentation') diff --git a/Documentation/crypto/api-samples.rst b/Documentation/crypto/api-samples.rst index f14afaaf2f32..e923f17bc2bd 100644 --- a/Documentation/crypto/api-samples.rst +++ b/Documentation/crypto/api-samples.rst @@ -4,111 +4,89 @@ Code Examples Code Example For Symmetric Key Cipher Operation ----------------------------------------------- -:: - - - /* tie all data structures together */ - struct skcipher_def { - struct scatterlist sg; - struct crypto_skcipher *tfm; - struct skcipher_request *req; - struct crypto_wait wait; - }; - - /* Perform cipher operation */ - static unsigned int test_skcipher_encdec(struct skcipher_def *sk, - int enc) - { - int rc; - - if (enc) - rc = crypto_wait_req(crypto_skcipher_encrypt(sk->req), &sk->wait); - else - rc = crypto_wait_req(crypto_skcipher_decrypt(sk->req), &sk->wait); - - if (rc) - pr_info("skcipher encrypt returned with result %d\n", rc); +This code encrypts some data with AES-256-XTS. For sake of example, +all inputs are random bytes, the encryption is done in-place, and it's +assumed the code is running in a context where it can sleep. - return rc; - } +:: - /* Initialize and trigger cipher operation */ static int test_skcipher(void) { - struct skcipher_def sk; - struct crypto_skcipher *skcipher = NULL; - struct skcipher_request *req = NULL; - char *scratchpad = NULL; - char *ivdata = NULL; - unsigned char key[32]; - int ret = -EFAULT; - - skcipher = crypto_alloc_skcipher("cbc-aes-aesni", 0, 0); - if (IS_ERR(skcipher)) { - pr_info("could not allocate skcipher handle\n"); - return PTR_ERR(skcipher); - } - - req = skcipher_request_alloc(skcipher, GFP_KERNEL); - if (!req) { - pr_info("could not allocate skcipher request\n"); - ret = -ENOMEM; - goto out; - } - - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, - &sk.wait); - - /* AES 256 with random key */ - get_random_bytes(&key, 32); - if (crypto_skcipher_setkey(skcipher, key, 32)) { - pr_info("key could not be set\n"); - ret = -EAGAIN; - goto out; - } - - /* IV will be random */ - ivdata = kmalloc(16, GFP_KERNEL); - if (!ivdata) { - pr_info("could not allocate ivdata\n"); - goto out; - } - get_random_bytes(ivdata, 16); - - /* Input data will be random */ - scratchpad = kmalloc(16, GFP_KERNEL); - if (!scratchpad) { - pr_info("could not allocate scratchpad\n"); - goto out; - } - get_random_bytes(scratchpad, 16); - - sk.tfm = skcipher; - sk.req = req; - - /* We encrypt one block */ - sg_init_one(&sk.sg, scratchpad, 16); - skcipher_request_set_crypt(req, &sk.sg, &sk.sg, 16, ivdata); - crypto_init_wait(&sk.wait); - - /* encrypt data */ - ret = test_skcipher_encdec(&sk, 1); - if (ret) - goto out; - - pr_info("Encryption triggered successfully\n"); - + struct crypto_skcipher *tfm = NULL; + struct skcipher_request *req = NULL; + u8 *data = NULL; + const size_t datasize = 512; /* data size in bytes */ + struct scatterlist sg; + DECLARE_CRYPTO_WAIT(wait); + u8 iv[16]; /* AES-256-XTS takes a 16-byte IV */ + u8 key[64]; /* AES-256-XTS takes a 64-byte key */ + int err; + + /* + * Allocate a tfm (a transformation object) and set the key. + * + * In real-world use, a tfm and key are typically used for many + * encryption/decryption operations. But in this example, we'll just do a + * single encryption operation with it (which is not very efficient). + */ + + tfm = crypto_alloc_skcipher("xts(aes)", 0, 0); + if (IS_ERR(tfm)) { + pr_err("Error allocating xts(aes) handle: %ld\n", PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + + get_random_bytes(key, sizeof(key)); + err = crypto_skcipher_setkey(tfm, key, sizeof(key)); + if (err) { + pr_err("Error setting key: %d\n", err); + goto out; + } + + /* Allocate a request object */ + req = skcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) { + err = -ENOMEM; + goto out; + } + + /* Prepare the input data */ + data = kmalloc(datasize, GFP_KERNEL); + if (!data) { + err = -ENOMEM; + goto out; + } + get_random_bytes(data, datasize); + + /* Initialize the IV */ + get_random_bytes(iv, sizeof(iv)); + + /* + * Encrypt the data in-place. + * + * For simplicity, in this example we wait for the request to complete + * before proceeding, even if the underlying implementation is asynchronous. + * + * To decrypt instead of encrypt, just change crypto_skcipher_encrypt() to + * crypto_skcipher_decrypt(). + */ + sg_init_one(&sg, data, datasize); + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + skcipher_request_set_crypt(req, &sg, &sg, datasize, iv); + err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (err) { + pr_err("Error encrypting data: %d\n", err); + goto out; + } + + pr_debug("Encryption was successful\n"); out: - if (skcipher) - crypto_free_skcipher(skcipher); - if (req) + crypto_free_skcipher(tfm); skcipher_request_free(req); - if (ivdata) - kfree(ivdata); - if (scratchpad) - kfree(scratchpad); - return ret; + kfree(data); + return err; } -- cgit From e6aacf9a52e0b013835c7e33538bcdf0703bbe2e Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Thu, 9 May 2019 20:20:18 +0100 Subject: dt-bindings: can: rcar_can: Fix RZ/G2 CAN clocks According to the latest information, the clock options for CAN on RZ/G2 are the same as the ones available on R-Car Gen3 Fixes: 868b7c0f43e6 ("dt-bindings: can: rcar_can: Add r8a774a1 support") Signed-off-by: Fabrizio Castro Reviewed-by: Chris Paterson Reviewed-by: Rob Herring Acked-by: David S. Miller Signed-off-by: Simon Horman --- Documentation/devicetree/bindings/net/can/rcar_can.txt | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt index 9936b9ee67c3..e0dfc7c35fc5 100644 --- a/Documentation/devicetree/bindings/net/can/rcar_can.txt +++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt @@ -27,13 +27,8 @@ Required properties: - reg: physical base address and size of the R-Car CAN register map. - interrupts: interrupt specifier for the sole interrupt. -- clocks: phandles and clock specifiers for 2 CAN clock inputs for RZ/G2 - devices. - phandles and clock specifiers for 3 CAN clock inputs for every other - SoC. -- clock-names: 2 clock input name strings for RZ/G2: "clkp1", "can_clk". - 3 clock input name strings for every other SoC: "clkp1", "clkp2", - "can_clk". +- clocks: phandles and clock specifiers for 3 CAN clock inputs. +- clock-names: 3 clock input name strings: "clkp1", "clkp2", and "can_clk". - pinctrl-0: pin control group to be used for this controller. - pinctrl-names: must be "default". @@ -49,8 +44,7 @@ using the below properties: Optional properties: - renesas,can-clock-select: R-Car CAN Clock Source Select. Valid values are: <0x0> (default) : Peripheral clock (clkp1) - <0x1> : Peripheral clock (clkp2) (not supported by - RZ/G2 devices) + <0x1> : Peripheral clock (clkp2) <0x3> : External input clock Example -- cgit From d703a52eb1ebeeba42f2385bbe84f1dc86f4353c Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Thu, 9 May 2019 20:20:19 +0100 Subject: dt-bindings: can: rcar_can: Add r8a774c0 support Document RZ/G2E (r8a774c0) SoC specific bindings. Signed-off-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Acked-by: David S. Miller Signed-off-by: Simon Horman --- Documentation/devicetree/bindings/net/can/rcar_can.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt index e0dfc7c35fc5..b463e1268ac4 100644 --- a/Documentation/devicetree/bindings/net/can/rcar_can.txt +++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt @@ -6,6 +6,7 @@ Required properties: "renesas,can-r8a7744" if CAN controller is a part of R8A7744 SoC. "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC. "renesas,can-r8a774a1" if CAN controller is a part of R8A774A1 SoC. + "renesas,can-r8a774c0" if CAN controller is a part of R8A774C0 SoC. "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC. "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC. "renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC. -- cgit From 2b37c1c3e7bbfaaa3910ee66f44f5a4138229884 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 1 May 2019 18:43:26 +0000 Subject: dt-bindings: perf: imx8-ddr: add imx8qxp ddr performance monitor Add binding documentation for the imx8qxp DDR performance monitor unit. Signed-off-by: Frank Li Reviewed-by: Rob Herring [will: Removed trailing newline] Signed-off-by: Will Deacon --- .../devicetree/bindings/perf/fsl-imx-ddr.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt new file mode 100644 index 000000000000..d77e3f26f9e6 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt @@ -0,0 +1,21 @@ +* Freescale(NXP) IMX8 DDR performance monitor + +Required properties: + +- compatible: should be one of: + "fsl,imx8-ddr-pmu" + "fsl,imx8m-ddr-pmu" + +- reg: physical address and size + +- interrupts: single interrupt + generated by the control block + +Example: + + ddr-pmu@5c020000 { + compatible = "fsl,imx8-ddr-pmu"; + reg = <0x5c020000 0x10000>; + interrupt-parent = <&gic>; + interrupts = ; + }; -- cgit From d9c53aa440b332059f7f0ce3f7868ff1dc58c62c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:31:00 -0600 Subject: NTB: Describe the ntb_msi_test client in the documentation. Add a blurb in Documentation/ntb.txt to describe the ntb_msi_test tool's debugfs interface. Similar to the (out of date) ntb_tool description. Signed-off-by: Logan Gunthorpe Signed-off-by: Jon Mason --- Documentation/ntb.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'Documentation') diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt index 074a423c853c..87d1372da879 100644 --- a/Documentation/ntb.txt +++ b/Documentation/ntb.txt @@ -200,6 +200,33 @@ Debugfs Files: This file is used to read and write peer scratchpads. See *spad* for details. +NTB MSI Test Client (ntb\_msi\_test) +------------------------------------ + +The MSI test client serves to test and debug the MSI library which +allows for passing MSI interrupts across NTB memory windows. The +test client is interacted with through the debugfs filesystem: + +* *debugfs*/ntb\_tool/*hw*/ + A directory in debugfs will be created for each + NTB device probed by the tool. This directory is shortened to *hw* + below. +* *hw*/port + This file describes the local port number +* *hw*/irq*_occurrences + One occurrences file exists for each interrupt and, when read, + returns the number of times the interrupt has been triggered. +* *hw*/peer*/port + This file describes the port number for each peer +* *hw*/peer*/count + This file describes the number of interrupts that can be + triggered on each peer +* *hw*/peer*/trigger + Writing an interrupt number (any number less than the value + specified in count) will trigger the interrupt on the + specified peer. That peer's interrupt's occurrence file + should be incremented. + NTB Hardware Drivers ==================== -- cgit From 2d3c72ed504196edb2f22a08cb03a0f9fb7e564f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2019 16:49:11 -0300 Subject: rdma: Remove nes This driver was first merged over 10 years ago and has not seen major activity by the authors in the last 7 years. However, in that time it has been patched 150 times to adapt it to changing kernel APIs. Further, the hardware has several issues, like not supporting 64 bit DMA, that make it rather uninteresting for use with modern systems and RDMA. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Reviewed-by: Shiraz Saleem Signed-off-by: Doug Ledford --- Documentation/ABI/stable/sysfs-class-infiniband | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband index 17211ceb9bf4..aed21b8916a2 100644 --- a/Documentation/ABI/stable/sysfs-class-infiniband +++ b/Documentation/ABI/stable/sysfs-class-infiniband @@ -423,23 +423,6 @@ Description: (e.g. driver restart on the VM which owns the VF). -sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes) ---------------------------------------------------------------- - -What: /sys/class/infiniband/nesX/hw_rev -What: /sys/class/infiniband/nesX/hca_type -What: /sys/class/infiniband/nesX/board_id -Date: Feb, 2008 -KernelVersion: v2.6.25 -Contact: linux-rdma@vger.kernel.org -Description: - hw_rev: (RO) Hardware revision number - - hca_type: (RO) Host Channel Adapter type (NEX020) - - board_id: (RO) Manufacturing board id - - sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4) ----------------------------------------------------- -- cgit From f7a6463e389e9222e5d6fb60211673e06a307438 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 17 May 2019 16:18:13 -0500 Subject: dt-bindings: vendor-prefixes: Also allow node names starting with '_' Generated nodes for overlays begin with '_'. The binding examples are built as overlays in order to allow unresolved phandles, so we need to allow the generated node names. Reviewed-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index e68c839e9f51..a777168432d0 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1028,7 +1028,7 @@ patternProperties: # Normal property name match without a comma # These should catch all node/property names without a prefix - "^[a-zA-Z0-9#][a-zA-Z0-9+\\-._@]{0,63}$": true + "^[a-zA-Z0-9#_][a-zA-Z0-9+\\-._@]{0,63}$": true "^[a-zA-Z0-9+\\-._]*@[0-9a-zA-Z,]*$": true "^#.*": true -- cgit From 837158b847a48d886f23937b539965f83917cc1f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 17 May 2019 16:17:08 -0500 Subject: dt-bindings: Check the examples against the schemas Currently, the binding examples are just built with dtc. dtc recently gained the support necessary to output the examples in YAML format (commit 87963ee20693 ("livetree: add missing type markers in generated overlay properties"). Now just switch the output format and the examples will be checked against the schema. Reviewed-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile index 8a2774b5834b..6b0dfd5c17ba 100644 --- a/Documentation/devicetree/bindings/Makefile +++ b/Documentation/devicetree/bindings/Makefile @@ -25,7 +25,7 @@ DT_DOCS = $(shell \ DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS)) extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES)) -extra-y += $(patsubst $(src)/%.yaml,%.example.dtb, $(DT_SCHEMA_FILES)) +extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES)) $(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE $(call if_changed,mk_schema) -- cgit From 630eccfd0a0355dc18b7e4456d3479901fb108e9 Mon Sep 17 00:00:00 2001 From: Hannes Schmelzer Date: Fri, 3 May 2019 11:23:32 +0200 Subject: Documentation: devicetree: Add vendor prefix for B&R Industrial Automation GmbH Signed-off-by: Hannes Schmelzer [robh: rework for schema] Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index a777168432d0..e0a3e5ff3825 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -147,6 +147,8 @@ patternProperties: description: Broadcom Corporation "^buffalo,.*": description: Buffalo, Inc. + "^bur,.*": + description: B&R Industrial Automation GmbH "^bticino,.*": description: Bticino International "^calxeda,.*": -- cgit From 00091c0da136f36feaa974f761fd0fc8905a29c9 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 11 Jun 2019 17:15:13 -0700 Subject: Documentation: net: mlx5: Add mlx5 initial documentation Add initial documentation for mlx5 driver. Signed-off-by: Saeed Mahameed --- Documentation/networking/device_drivers/index.rst | 1 + .../networking/device_drivers/mellanox/mlx5.rst | 101 +++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 Documentation/networking/device_drivers/mellanox/mlx5.rst (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 75fa537763a4..24598d5f8ffa 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -21,6 +21,7 @@ Contents: intel/i40e intel/iavf intel/ice + mellanox/mlx5 .. only:: subproject diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst new file mode 100644 index 000000000000..0be802186d12 --- /dev/null +++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst @@ -0,0 +1,101 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +================================================= +Mellanox ConnectX(R) mlx5 core VPI Network Driver +================================================= + +Copyright (c) 2019, Mellanox Technologies LTD. + +Contents +======== + +- `Enabling the driver and kconfig options`_ + +Enabling the driver and kconfig options +================================================ + +| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out) +| at build time via kernel Kconfig flags. +| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags +| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y. +| For the list of advanced features please see below. + +**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko) + +| The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config. +| This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib). + + +**CONFIG_MLX5_CORE_EN=(y/n)** + +| Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads. +| mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be +| built-in into mlx5_core.ko. + + +**CONFIG_MLX5_EN_ARFS=(y/n)** + +| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering. +| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4 + + +**CONFIG_MLX5_EN_RXNFC=(y/n)** + +| Enables ethtool receive network flow classification, which allows user defined +| flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API. + + +**CONFIG_MLX5_CORE_EN_DCB=(y/n)**: + +| Enables `Data Center Bridging (DCB) Support `_. + + +**CONFIG_MLX5_MPFS=(y/n)** + +| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC. +| MPFs is required for when `Multi-Host `_ configuration is enabled to allow passing +| user configured unicast MAC addresses to the requesting PF. + + +**CONFIG_MLX5_ESWITCH=(y/n)** + +| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering +| and switching for the enabled VFs and PF in two available modes: +| 1) `Legacy SRIOV mode (L2 mac vlan steering based) `_. +| 2) `Switchdev mode (eswitch offloads) `_. + + +**CONFIG_MLX5_CORE_IPOIB=(y/n)** + +| IPoIB offloads & acceleration support. +| Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma +| IPoIB ulp netdevice. + + +**CONFIG_MLX5_FPGA=(y/n)** + +| Build support for the Innova family of network cards by Mellanox Technologies. +| Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board. +| If you select this option, the mlx5_core driver will include the Innova FPGA core and allow +| building sandbox-specific client drivers. + + +**CONFIG_MLX5_EN_IPSEC=(y/n)** + +| Enables `IPSec XFRM cryptography-offload accelaration `_. + +**CONFIG_MLX5_EN_TLS=(y/n)** + +| TLS cryptography-offload accelaration. + + +**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko) + +| Provides low-level InfiniBand/RDMA and `RoCE `_ support. + + +**External options** ( Choose if the corresponding mlx5 feature is required ) + +- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled +- CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled. +- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). -- cgit From 06efeb555524a8c65ef429f2603885c31a5212b1 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 1 Jun 2019 16:40:16 +0300 Subject: Documentation: net: mlx5: Devlink health documentation Documentation for devlink health reporters supported by mlx5. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../networking/device_drivers/mellanox/mlx5.rst | 72 ++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst index 0be802186d12..4eeef2df912f 100644 --- a/Documentation/networking/device_drivers/mellanox/mlx5.rst +++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst @@ -10,6 +10,7 @@ Contents ======== - `Enabling the driver and kconfig options`_ +- `Devlink health reporters`_ Enabling the driver and kconfig options ================================================ @@ -99,3 +100,74 @@ Enabling the driver and kconfig options - CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled - CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled. - CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). + + +Devlink health reporters +======================== + +tx reporter +----------- +The tx reporter is responsible of two error scenarios: + +- TX timeout + Report on kernel tx timeout detection. + Recover by searching lost interrupts. +- TX error completion + Report on error tx completion. + Recover by flushing the TX queue and reset it. + +TX reporter also support Diagnose callback, on which it provides +real time information of its send queues status. + +User commands examples: + +- Diagnose send queues status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter tx + +- Show number of tx errors indicated, number of recover flows ended successfully, + is autorecover enabled and graceful period from last recover:: + + $ devlink health show pci/0000:82:00.0 reporter tx + +fw reporter +----------- +The fw reporter implements diagnose and dump callbacks. +It follows symptoms of fw error such as fw syndrome by triggering +fw core dump and storing it into the dump buffer. +The fw reporter diagnose command can be triggered any time by the user to check +current fw status. + +User commands examples: + +- Check fw heath status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter fw + +- Read FW core dump if already stored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.0 reporter fw + +NOTE: This command can run only on the PF which has fw tracer ownership, +running it on other PF or any VF will return "Operation not permitted". + +fw fatal reporter +----------------- +The fw fatal reporter implements dump and recover callbacks. +It follows fatal errors indications by CR-space dump and recover flow. +The CR-space dump uses vsc interface which is valid even if the FW command +interface is not functional, which is the case in most FW fatal errors. +The recover function runs recover flow which reloads the driver and triggers fw +reset if needed. + +User commands examples: + +- Run fw recover flow manually:: + + $ devlink health recover pci/0000:82:00.0 reporter fw_fatal + +- Read FW CR-space dump if already strored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal + +NOTE: This command can run only on PF. -- cgit From 573748081a66b2baacce5e0927c700e480a60c7c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 14 May 2019 10:50:24 +0200 Subject: dt-bindings: property-units: Sanitize unit naming Make the naming of units consistent with common practices: - Do not capitalize the first character of units ("Celsius" is special, as it is not the unit name, but a reference to its proposer), - Do not use plural for units, - Do not abbreviate "ampere", - Concatenate prefixes and units (no spaces or hyphens), - Separate units by spaces not hyphens, - "milli" applies to "degree", not to "Celsius". Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- .../devicetree/bindings/property-units.txt | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/property-units.txt b/Documentation/devicetree/bindings/property-units.txt index bfd33734faca..e9b8360b3288 100644 --- a/Documentation/devicetree/bindings/property-units.txt +++ b/Documentation/devicetree/bindings/property-units.txt @@ -12,32 +12,32 @@ unit prefixes. Time/Frequency ---------------------------------------- -mhz : megahertz --hz : Hertz (preferred) --sec : seconds --ms : milliseconds --us : microseconds --ns : nanoseconds +-hz : hertz (preferred) +-sec : second +-ms : millisecond +-us : microsecond +-ns : nanosecond Distance ---------------------------------------- --mm : millimeters +-mm : millimeter Electricity ---------------------------------------- --microamp : micro amps --microamp-hours : micro amp-hours --ohms : Ohms --micro-ohms : micro Ohms --microwatt-hours: micro Watt-hours --microvolt : micro volts --picofarads : picofarads --femtofarads : femtofarads +-microamp : microampere +-microamp-hours : microampere hour +-ohms : ohm +-micro-ohms : microohm +-microwatt-hours: microwatt hour +-microvolt : microvolt +-picofarads : picofarad +-femtofarads : femtofarad Temperature ---------------------------------------- --celsius : Degrees Celsius --millicelsius : Degreee milli-Celsius +-celsius : degree Celsius +-millicelsius : millidegree Celsius Pressure ---------------------------------------- --kpascal : kiloPascal +-kpascal : kilopascal -- cgit From 9129b017b54dab09eb69b7269026243156e5188e Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 27 May 2019 10:49:57 +0200 Subject: rcu: Don't return a value from rcu_assign_pointer() Quoting Paul [1]: "Given that a quick (and perhaps error-prone) search of the uses of rcu_assign_pointer() in v5.1 didn't find a single use of the return value, let's please instead change the documentation and implementation to eliminate the return value." [1] https://lkml.kernel.org/r/20190523135013.GL28207@linux.ibm.com Signed-off-by: Andrea Parri Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Cc: Peter Zijlstra Cc: Will Deacon Cc: Mark Rutland Cc: Matthew Wilcox Cc: Sasha Levin Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 981651a8b65d..7e1a8721637a 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -212,7 +212,7 @@ synchronize_rcu() rcu_assign_pointer() - typeof(p) rcu_assign_pointer(p, typeof(p) v); + void rcu_assign_pointer(p, typeof(p) v); Yes, rcu_assign_pointer() -is- implemented as a macro, though it would be cool to be able to declare a function in this manner. @@ -220,9 +220,9 @@ rcu_assign_pointer() The updater uses this function to assign a new value to an RCU-protected pointer, in order to safely communicate the change - in value from the updater to the reader. This function returns - the new value, and also executes any memory-barrier instructions - required for a given CPU architecture. + in value from the updater to the reader. This macro does not + evaluate to an rvalue, but it does execute any memory-barrier + instructions required for a given CPU architecture. Perhaps just as important, it serves to document (1) which pointers are protected by RCU and (2) the point at which a -- cgit From 4c0a59e1123f90b773d46942a6d505b3cb5bd406 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Tue, 21 May 2019 15:05:12 +0530 Subject: Documentation: arm: Link idle-states binding to "enable-method" property The "enable-method" property for cpu nodes needs to be "psci" for CPU idle management to be setup correctly. Add a note to the binding documentation to this effect to make it obvious. Signed-off-by: Amit Kucheria Acked-by: Sudeep Holla Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/arm/idle-states.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/idle-states.txt b/Documentation/devicetree/bindings/arm/idle-states.txt index 45730ba60af5..3bdbe675b9e6 100644 --- a/Documentation/devicetree/bindings/arm/idle-states.txt +++ b/Documentation/devicetree/bindings/arm/idle-states.txt @@ -241,9 +241,13 @@ processor idle states, defined as device tree nodes, are listed. - "psci" # On ARM 32-bit systems this property is optional -The nodes describing the idle states (state) can only be defined within the -idle-states node, any other configuration is considered invalid and therefore -must be ignored. +This assumes that the "enable-method" property is set to "psci" in the cpu +node[6] that is responsible for setting up CPU idle management in the OS +implementation. + +The nodes describing the idle states (state) can only be defined +within the idle-states node, any other configuration is considered invalid +and therefore must be ignored. =========================================== 4 - state node @@ -697,3 +701,6 @@ cpus { [5] Devicetree Specification https://www.devicetree.org/specifications/ + +[6] ARM Linux Kernel documentation - Booting AArch64 Linux + Documentation/arm64/booting.txt -- cgit From 0ed91bded307cc980f9542eff861266f1744c303 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Tue, 11 Jun 2019 10:34:32 -0500 Subject: dt-bindings: pl330: document the optional resets property Add the optional resets property the pl330 dma node. Signed-off-by: Dinh Nguyen Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/arm-pl330.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt b/Documentation/devicetree/bindings/dma/arm-pl330.txt index db7e2260f9c5..2c7fd1941abb 100644 --- a/Documentation/devicetree/bindings/dma/arm-pl330.txt +++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt @@ -16,6 +16,9 @@ Optional properties: - dma-channels: contains the total number of DMA channels supported by the DMAC - dma-requests: contains the total number of DMA requests supported by the DMAC - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP + - resets: contains an entry for each entry in reset-names. + See ../reset/reset.txt for details. + - reset-names: must contain at least "dma", and optional is "dma-ocp". Example: -- cgit From 1e87fec9fa52a6f7c223998d6bfbd3464eb37e31 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 16 May 2019 11:44:52 +0200 Subject: mac80211: call rate_control_send_low() internally There's no rate control algorithm that *doesn't* want to call it internally, and calling it internally will let us modify its behaviour in the future. Signed-off-by: Johannes Berg --- Documentation/driver-api/80211/mac80211-advanced.rst | 3 --- 1 file changed, 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/80211/mac80211-advanced.rst b/Documentation/driver-api/80211/mac80211-advanced.rst index 70a89b2163c2..9f1c5bb7ac35 100644 --- a/Documentation/driver-api/80211/mac80211-advanced.rst +++ b/Documentation/driver-api/80211/mac80211-advanced.rst @@ -226,9 +226,6 @@ TBD .. kernel-doc:: include/net/mac80211.h :functions: ieee80211_tx_rate_control -.. kernel-doc:: include/net/mac80211.h - :functions: rate_control_send_low - TBD This part of the book describes mac80211 internals. -- cgit From c2aacceedc86af87428d998e23a1aca24fd8aa2e Mon Sep 17 00:00:00 2001 From: Nick Xie Date: Mon, 10 Jun 2019 15:57:53 +0800 Subject: arm64: dts: rockchip: Add support for Khadas Edge/Edge-V/Captain boards Add devicetree support for Khadas Edge/Edge-V/Captain boards. Khadas Edge is an expandable Rockchip RK3399 board with goldfinger. Khadas Captain is the carrier board for Khadas Edge. Khadas Edge-V is a Khadas VIM form factor Rockchip RK3399 board. Signed-off-by: Nick Xie [edge-captain and edge-v contain different components that are supposed to get added in future patches, so should stay separate while looking somewhat similar right now] Signed-off-by: Heiko Stuebner --- Documentation/devicetree/bindings/arm/rockchip.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml index 5c6bbf10abc9..eef822ce2ad4 100644 --- a/Documentation/devicetree/bindings/arm/rockchip.yaml +++ b/Documentation/devicetree/bindings/arm/rockchip.yaml @@ -316,6 +316,14 @@ properties: - const: haoyu,marsboard-rk3066 - const: rockchip,rk3066a + - description: Khadas Edge series boards + items: + - enum: + - khadas,edge + - khadas,edge-captain + - khadas,edge-v + - const: rockchip,rk3399 + - description: mqmaker MiQi items: - const: mqmaker,miqi -- cgit From ed66bcd0674a97cbacb01e7d140eea0f882d31a6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 23:04:07 -0300 Subject: ABI: fix some syntax issues at the ABI database On those three files, the ABI representation described at README are violated. - at sysfs-bus-iio-proximity-as3935: a ':' character is missing after "What" - at sysfs-class-devfreq: there's a typo at Description - at sysfs-class-cxl, it is using the ":" character at a file preamble, causing it to be misinterpreted as a tag. - On the other files, instead of "What", they use "Where". Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab Acked-by: Rafael J. Wysocki Acked-by: Andrew Donnellan # cxl Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/pstore | 2 +- .../testing/sysfs-bus-event_source-devices-format | 2 +- .../ABI/testing/sysfs-bus-i2c-devices-hm6352 | 6 +++--- .../ABI/testing/sysfs-bus-iio-distance-srf08 | 4 ++-- .../ABI/testing/sysfs-bus-iio-proximity-as3935 | 4 ++-- .../ABI/testing/sysfs-bus-pci-devices-cciss | 22 +++++++++++----------- .../ABI/testing/sysfs-bus-usb-devices-usbsevseg | 12 ++++++------ Documentation/ABI/testing/sysfs-class-cxl | 6 +++--- Documentation/ABI/testing/sysfs-class-devfreq | 2 +- Documentation/ABI/testing/sysfs-class-powercap | 2 +- Documentation/ABI/testing/sysfs-kernel-fscaps | 2 +- Documentation/ABI/testing/sysfs-kernel-vmcoreinfo | 2 +- 12 files changed, 33 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore index 5fca9f5e10a3..8d6e48f4e8ef 100644 --- a/Documentation/ABI/testing/pstore +++ b/Documentation/ABI/testing/pstore @@ -1,4 +1,4 @@ -Where: /sys/fs/pstore/... (or /dev/pstore/...) +What: /sys/fs/pstore/... (or /dev/pstore/...) Date: March 2011 Kernel Version: 2.6.39 Contact: tony.luck@intel.com diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format index 77f47ff5ee02..b6f8748e0200 100644 --- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format @@ -1,4 +1,4 @@ -Where: /sys/bus/event_source/devices//format +What: /sys/bus/event_source/devices//format Date: January 2012 Kernel Version: 3.3 Contact: Jiri Olsa diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 index feb2e4a87075..29bd447e50a0 100644 --- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 +++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 @@ -1,18 +1,18 @@ -Where: /sys/bus/i2c/devices/.../heading0_input +What: /sys/bus/i2c/devices/.../heading0_input Date: April 2010 Kernel Version: 2.6.36? Contact: alan.cox@intel.com Description: Reports the current heading from the compass as a floating point value in degrees. -Where: /sys/bus/i2c/devices/.../power_state +What: /sys/bus/i2c/devices/.../power_state Date: April 2010 Kernel Version: 2.6.36? Contact: alan.cox@intel.com Description: Sets the power state of the device. 0 sets the device into sleep mode, 1 wakes it up. -Where: /sys/bus/i2c/devices/.../calibration +What: /sys/bus/i2c/devices/.../calibration Date: April 2010 Kernel Version: 2.6.36? Contact: alan.cox@intel.com diff --git a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 index 0a1ca1487fa9..a133fd8d081a 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 +++ b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 @@ -1,4 +1,4 @@ -What /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity +What: /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity Date: January 2017 KernelVersion: 4.11 Contact: linux-iio@vger.kernel.org @@ -6,7 +6,7 @@ Description: Show or set the gain boost of the amp, from 0-31 range. default 31 -What /sys/bus/iio/devices/iio:deviceX/sensor_max_range +What: /sys/bus/iio/devices/iio:deviceX/sensor_max_range Date: January 2017 KernelVersion: 4.11 Contact: linux-iio@vger.kernel.org diff --git a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 index 9a17ab5036a4..c59d95346341 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 +++ b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 @@ -1,4 +1,4 @@ -What /sys/bus/iio/devices/iio:deviceX/in_proximity_input +What: /sys/bus/iio/devices/iio:deviceX/in_proximity_input Date: March 2014 KernelVersion: 3.15 Contact: Matt Ranostay @@ -6,7 +6,7 @@ Description: Get the current distance in meters of storm (1km steps) 1000-40000 = distance in meters -What /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity +What: /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity Date: March 2014 KernelVersion: 3.15 Contact: Matt Ranostay diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss index 53d99edd1d75..eb449169c30b 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss @@ -1,66 +1,66 @@ -Where: /sys/bus/pci/devices//ccissX/cXdY/model +What: /sys/bus/pci/devices//ccissX/cXdY/model Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: Displays the SCSI INQUIRY page 0 model for logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/rev +What: /sys/bus/pci/devices//ccissX/cXdY/rev Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: Displays the SCSI INQUIRY page 0 revision for logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/unique_id +What: /sys/bus/pci/devices//ccissX/cXdY/unique_id Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: Displays the SCSI INQUIRY page 83 serial number for logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/vendor +What: /sys/bus/pci/devices//ccissX/cXdY/vendor Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: Displays the SCSI INQUIRY page 0 vendor for logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/block:cciss!cXdY +What: /sys/bus/pci/devices//ccissX/cXdY/block:cciss!cXdY Date: March 2009 Kernel Version: 2.6.30 Contact: iss_storagedev@hp.com Description: A symbolic link to /sys/block/cciss!cXdY -Where: /sys/bus/pci/devices//ccissX/rescan +What: /sys/bus/pci/devices//ccissX/rescan Date: August 2009 Kernel Version: 2.6.31 Contact: iss_storagedev@hp.com Description: Kicks of a rescan of the controller to discover logical drive topology changes. -Where: /sys/bus/pci/devices//ccissX/cXdY/lunid +What: /sys/bus/pci/devices//ccissX/cXdY/lunid Date: August 2009 Kernel Version: 2.6.31 Contact: iss_storagedev@hp.com Description: Displays the 8-byte LUN ID used to address logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/raid_level +What: /sys/bus/pci/devices//ccissX/cXdY/raid_level Date: August 2009 Kernel Version: 2.6.31 Contact: iss_storagedev@hp.com Description: Displays the RAID level of logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/cXdY/usage_count +What: /sys/bus/pci/devices//ccissX/cXdY/usage_count Date: August 2009 Kernel Version: 2.6.31 Contact: iss_storagedev@hp.com Description: Displays the usage count (number of opens) of logical drive Y of controller X. -Where: /sys/bus/pci/devices//ccissX/resettable +What: /sys/bus/pci/devices//ccissX/resettable Date: February 2011 Kernel Version: 2.6.38 Contact: iss_storagedev@hp.com @@ -71,7 +71,7 @@ Description: Value of 1 indicates the controller can honor the reset_devices a dump device, as kdump requires resetting the device in order to work reliably. -Where: /sys/bus/pci/devices//ccissX/transport_mode +What: /sys/bus/pci/devices//ccissX/transport_mode Date: July 2011 Kernel Version: 3.0 Contact: iss_storagedev@hp.com diff --git a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg index 70d00dfa443d..f6199b314196 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg +++ b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg @@ -1,12 +1,12 @@ -Where: /sys/bus/usb/.../powered +What: /sys/bus/usb/.../powered Date: August 2008 Kernel Version: 2.6.26 Contact: Harrison Metzger Description: Controls whether the device's display will powered. A value of 0 is off and a non-zero value is on. -Where: /sys/bus/usb/.../mode_msb -Where: /sys/bus/usb/.../mode_lsb +What: /sys/bus/usb/.../mode_msb +What: /sys/bus/usb/.../mode_lsb Date: August 2008 Kernel Version: 2.6.26 Contact: Harrison Metzger @@ -16,7 +16,7 @@ Description: Controls the devices display mode. for an 8 character display the values are MSB 0x08; LSB 0xFF. -Where: /sys/bus/usb/.../textmode +What: /sys/bus/usb/.../textmode Date: August 2008 Kernel Version: 2.6.26 Contact: Harrison Metzger @@ -25,13 +25,13 @@ Description: Controls the way the device interprets its text buffer. hex: each character is between 0-15 ascii: each character is between '0'-'9' and 'A'-'F'. -Where: /sys/bus/usb/.../text +What: /sys/bus/usb/.../text Date: August 2008 Kernel Version: 2.6.26 Contact: Harrison Metzger Description: The text (or data) for the device to display -Where: /sys/bus/usb/.../decimals +What: /sys/bus/usb/.../decimals Date: August 2008 Kernel Version: 2.6.26 Contact: Harrison Metzger diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl index bbbabffc682a..7970e3713e70 100644 --- a/Documentation/ABI/testing/sysfs-class-cxl +++ b/Documentation/ABI/testing/sysfs-class-cxl @@ -1,6 +1,6 @@ -Note: Attributes that are shared between devices are stored in the directory -pointed to by the symlink device/. -Example: The real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is +Please note that attributes that are shared between devices are stored in +the directory pointed to by the symlink device/. +For example, the real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is /sys/class/cxl/afu0.0s/device/irqs_max, i.e. /sys/class/cxl/afu0.0/irqs_max. diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq index ee39acacf6f8..01196e19afca 100644 --- a/Documentation/ABI/testing/sysfs-class-devfreq +++ b/Documentation/ABI/testing/sysfs-class-devfreq @@ -47,7 +47,7 @@ Description: What: /sys/class/devfreq/.../trans_stat Date: October 2012 Contact: MyungJoo Ham -Descrtiption: +Description: This ABI shows the statistics of devfreq behavior on a specific device. It shows the time spent in each state and the number of transitions between states. diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap index db3b3ff70d84..f333a0ccc29b 100644 --- a/Documentation/ABI/testing/sysfs-class-powercap +++ b/Documentation/ABI/testing/sysfs-class-powercap @@ -147,6 +147,6 @@ What: /sys/class/powercap/...//enabled Date: September 2013 KernelVersion: 3.13 Contact: linux-pm@vger.kernel.org -Description +Description: This allows to enable/disable power capping at power zone level. This applies to current power zone and its children. diff --git a/Documentation/ABI/testing/sysfs-kernel-fscaps b/Documentation/ABI/testing/sysfs-kernel-fscaps index 50a3033b5e15..bcff34665192 100644 --- a/Documentation/ABI/testing/sysfs-kernel-fscaps +++ b/Documentation/ABI/testing/sysfs-kernel-fscaps @@ -2,7 +2,7 @@ What: /sys/kernel/fscaps Date: February 2011 KernelVersion: 2.6.38 Contact: Ludwig Nussel -Description +Description: Shows whether file system capabilities are honored when executing a binary diff --git a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo index 7bd81168e063..1f1087a5f075 100644 --- a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo +++ b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo @@ -4,7 +4,7 @@ KernelVersion: 2.6.24 Contact: Ken'ichi Ohmichi Kexec Mailing List Vivek Goyal -Description +Description: Shows physical address and size of vmcoreinfo ELF note. First value contains physical address of note in hex and second value contains the size of note in hex. This ELF -- cgit From 745b2888a2afdb33055f62ba3b245b79e2462e65 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 23:04:08 -0300 Subject: ABI: sysfs-driver-hid: the "What" field doesn't parse fine The What: field on this ABI description use a different syntax than expected, causing it to not be properly parsed by script. Fix it. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-driver-hid | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-driver-hid b/Documentation/ABI/testing/sysfs-driver-hid index 48942cacb0bf..a59533410871 100644 --- a/Documentation/ABI/testing/sysfs-driver-hid +++ b/Documentation/ABI/testing/sysfs-driver-hid @@ -1,6 +1,6 @@ -What: For USB devices : /sys/bus/usb/devices/-:./::./report_descriptor - For BT devices : /sys/class/bluetooth/hci/::./report_descriptor - Symlink : /sys/class/hidraw/hidraw/device/report_descriptor +What: /sys/bus/usb/devices/-:./::./report_descriptor +What: /sys/class/bluetooth/hci/::./report_descriptor +What: /sys/class/hidraw/hidraw/device/report_descriptor Date: Jan 2011 KernelVersion: 2.0.39 Contact: Alan Ott @@ -9,9 +9,9 @@ Description: When read, this file returns the device's raw binary HID This file cannot be written. Users: HIDAPI library (http://www.signal11.us/oss/hidapi) -What: For USB devices : /sys/bus/usb/devices/-:./::./country - For BT devices : /sys/class/bluetooth/hci/::./country - Symlink : /sys/class/hidraw/hidraw/device/country +What: /sys/bus/usb/devices/-:./::./country +What: /sys/class/bluetooth/hci/::./country +What: /sys/class/hidraw/hidraw/device/country Date: February 2015 KernelVersion: 3.19 Contact: Olivier Gay -- cgit From d59f0ec7151e8f70ceaf58e2ddd057e3296cc2d7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 23:04:09 -0300 Subject: ABI: sysfs-class-uwb_rc: remove a duplicated incomplete entry There are two entries for /sys/class/uwb_rc/uwbN//BPST. The second one has a missing description. Get rid of it. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-uwb_rc | 6 ------ 1 file changed, 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc index 85f4875d16ac..a0578751c1e3 100644 --- a/Documentation/ABI/testing/sysfs-class-uwb_rc +++ b/Documentation/ABI/testing/sysfs-class-uwb_rc @@ -125,12 +125,6 @@ Description: The EUI-48 of this device in colon separated hex octets. -What: /sys/class/uwb_rc/uwbN//BPST -Date: July 2008 -KernelVersion: 2.6.27 -Contact: linux-usb@vger.kernel.org -Description: - What: /sys/class/uwb_rc/uwbN//IEs Date: July 2008 KernelVersion: 2.6.27 -- cgit From 1107049034ac094dd66998ec91d59d8f8807c88a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 23:04:10 -0300 Subject: ABI: better identificate tables When parsing via script, it is important to know if the script should consider a description as a literal block that should be displayed as-is, or if the description can be considered as a normal text. Change descriptions to ensure that the preceding line of a table ends with a colon. That makes easy to identify the need of a literal block. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra | 2 +- Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 | 6 +++--- Documentation/ABI/testing/sysfs-class-led-driver-lm3533 | 8 ++++---- Documentation/ABI/testing/sysfs-class-leds-gt683r | 4 ++-- Documentation/ABI/testing/sysfs-driver-hid-roccat-kone | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra index 16020b31ae64..5d41ebadf15e 100644 --- a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra +++ b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra @@ -5,7 +5,7 @@ Description: It is possible to switch the cpi setting of the mouse with the press of a button. When read, this file returns the raw number of the actual cpi setting reported by the mouse. This number has to be further - processed to receive the real dpi value. + processed to receive the real dpi value: VALUE DPI 1 400 diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 index 77cf7ac949af..c0e0a9ae7b3d 100644 --- a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 +++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 @@ -4,7 +4,7 @@ KernelVersion: 3.5 Contact: Johan Hovold Description: Get the ALS output channel used as input in - ALS-current-control mode (0, 1), where + ALS-current-control mode (0, 1), where: 0 - out_current0 (backlight 0) 1 - out_current1 (backlight 1) @@ -28,7 +28,7 @@ Date: April 2012 KernelVersion: 3.5 Contact: Johan Hovold Description: - Set the brightness-mapping mode (0, 1), where + Set the brightness-mapping mode (0, 1), where: 0 - exponential mode 1 - linear mode @@ -38,7 +38,7 @@ Date: April 2012 KernelVersion: 3.5 Contact: Johan Hovold Description: - Set the PWM-input control mask (5 bits), where + Set the PWM-input control mask (5 bits), where: bit 5 - PWM-input enabled in Zone 4 bit 4 - PWM-input enabled in Zone 3 diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 index 620ebb3b9baa..e4c89b261546 100644 --- a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 +++ b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 @@ -4,7 +4,7 @@ KernelVersion: 3.5 Contact: Johan Hovold Description: Set the ALS output channel to use as input in - ALS-current-control mode (1, 2), where + ALS-current-control mode (1, 2), where: 1 - out_current1 2 - out_current2 @@ -22,7 +22,7 @@ Date: April 2012 KernelVersion: 3.5 Contact: Johan Hovold Description: - Set the pattern generator fall and rise times (0..7), where + Set the pattern generator fall and rise times (0..7), where: 0 - 2048 us 1 - 262 ms @@ -45,7 +45,7 @@ Date: April 2012 KernelVersion: 3.5 Contact: Johan Hovold Description: - Set the brightness-mapping mode (0, 1), where + Set the brightness-mapping mode (0, 1), where: 0 - exponential mode 1 - linear mode @@ -55,7 +55,7 @@ Date: April 2012 KernelVersion: 3.5 Contact: Johan Hovold Description: - Set the PWM-input control mask (5 bits), where + Set the PWM-input control mask (5 bits), where: bit 5 - PWM-input enabled in Zone 4 bit 4 - PWM-input enabled in Zone 3 diff --git a/Documentation/ABI/testing/sysfs-class-leds-gt683r b/Documentation/ABI/testing/sysfs-class-leds-gt683r index e4fae6026e79..6adab27f646e 100644 --- a/Documentation/ABI/testing/sysfs-class-leds-gt683r +++ b/Documentation/ABI/testing/sysfs-class-leds-gt683r @@ -5,7 +5,7 @@ Contact: Janne Kanniainen Description: Set the mode of LEDs. You should notice that changing the mode of one LED will update the mode of its two sibling devices as - well. + well. Possible values are: 0 - normal 1 - audio @@ -13,4 +13,4 @@ Description: Normal: LEDs are fully on when enabled Audio: LEDs brightness depends on sound level - Breathing: LEDs brightness varies at human breathing rate \ No newline at end of file + Breathing: LEDs brightness varies at human breathing rate diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone index 3ca3971109bf..8f7982c70d72 100644 --- a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone +++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone @@ -5,7 +5,7 @@ Description: It is possible to switch the dpi setting of the mouse with the press of a button. When read, this file returns the raw number of the actual dpi setting reported by the mouse. This number has to be further - processed to receive the real dpi value. + processed to receive the real dpi value: VALUE DPI 1 800 -- cgit From ce1a5ea18ef9bf4c62c75abe7c540a29264ec988 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 14 Jun 2019 09:02:49 +0200 Subject: Documentation: Remove duplicate x86 index entry x86 got added twice to the index via the RST conversion and the MDS documentation changes. Remove one instance. Signed-off-by: Thomas Gleixner Signed-off-by: Jonathan Corbet --- Documentation/index.rst | 1 - 1 file changed, 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/index.rst b/Documentation/index.rst index a7566ef62411..781042b4579d 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -112,7 +112,6 @@ implementation. .. toctree:: :maxdepth: 2 - x86/index sh/index x86/index -- cgit From 1419f64ef526b5657ccfe604a69533736db44490 Mon Sep 17 00:00:00 2001 From: Erwan Le Ray Date: Fri, 24 May 2019 17:30:38 +0200 Subject: dt-bindings: stm32: serial: Add optional reset STM32 serial can be reset via reset controller. Add an optional reset property to stm32 usart bindings. Signed-off-by: Erwan Le Ray Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/serial/st,stm32-usart.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt index 9d3efed55deb..a6b19485c9dc 100644 --- a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt +++ b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt @@ -13,6 +13,7 @@ Required properties: - clocks: The input clock of the USART instance Optional properties: +- resets: Must contain the phandle to the reset controller. - pinctrl: The reference on the pins configuration - st,hw-flow-ctrl: bool flag to enable hardware flow control. - rs485-rts-delay, rs485-rx-during-tx, rs485-rts-active-low, -- cgit From abdcfc25641c5ba6d63047bec1dc8d3aaa7d4111 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 15 May 2019 15:18:56 +0200 Subject: ASoC: meson: add tohdmitx DT bindings Add the bindings and the related documentation for the audio hdmitx control glue of the Amlogic g12a SoC family Signed-off-by: Jerome Brunet Tested-by: Neil Armstrong Tested-by: Kevin Hilman Signed-off-by: Mark Brown --- .../bindings/sound/amlogic,g12a-tohdmitx.txt | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt new file mode 100644 index 000000000000..aa6c35570d31 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt @@ -0,0 +1,55 @@ +* Amlogic HDMI Tx control glue + +Required properties: +- compatible: "amlogic,g12a-tohdmitx" +- reg: physical base address of the controller and length of memory + mapped region. +- #sound-dai-cells: should be 1. + +Example on the S905X2 SoC: + +tohdmitx: audio-controller@744 { + compatible = "amlogic,g12a-tohdmitx"; + reg = <0x0 0x744 0x0 0x4>; + #sound-dai-cells = <1>; +}; + +Example of an 'amlogic,axg-sound-card': + +sound { + compatible = "amlogic,axg-sound-card"; + +[...] + + dai-link-x { + sound-dai = <&tdmif_a>; + dai-format = "i2s"; + dai-tdm-slot-tx-mask-0 = <1 1>; + + codec-0 { + sound-dai = <&tohdmitx TOHDMITX_I2S_IN_A>; + }; + + codec-1 { + sound-dai = <&external_dac>; + }; + }; + + dai-link-y { + sound-dai = <&tdmif_c>; + dai-format = "i2s"; + dai-tdm-slot-tx-mask-0 = <1 1>; + + codec { + sound-dai = <&tohdmitx TOHDMITX_I2S_IN_C>; + }; + }; + + dai-link-z { + sound-dai = <&tohdmitx TOHDMITX_I2S_OUT>; + + codec { + sound-dai = <&hdmi_tx>; + }; + }; +}; -- cgit From 305a99eb98af22996e9771078b7a19978732ed41 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:37 -0300 Subject: docs: aoe: convert docs to ReST and rename to *.rst There are only two files within Documentation/aoe dir that are documentation. The remaining ones are examples and shell scripts. Convert the two AoE files to ReST format, and add the others as literal, as they're part of the documentation. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/aoe/aoe.rst | 150 +++++++++++++++++++++++++++++++++++++++++ Documentation/aoe/aoe.txt | 143 --------------------------------------- Documentation/aoe/examples.rst | 23 +++++++ Documentation/aoe/index.rst | 19 ++++++ Documentation/aoe/todo.rst | 17 +++++ Documentation/aoe/todo.txt | 14 ---- Documentation/aoe/udev.txt | 2 +- 7 files changed, 210 insertions(+), 158 deletions(-) create mode 100644 Documentation/aoe/aoe.rst delete mode 100644 Documentation/aoe/aoe.txt create mode 100644 Documentation/aoe/examples.rst create mode 100644 Documentation/aoe/index.rst create mode 100644 Documentation/aoe/todo.rst delete mode 100644 Documentation/aoe/todo.txt (limited to 'Documentation') diff --git a/Documentation/aoe/aoe.rst b/Documentation/aoe/aoe.rst new file mode 100644 index 000000000000..58747ecec71d --- /dev/null +++ b/Documentation/aoe/aoe.rst @@ -0,0 +1,150 @@ +Introduction +============ + +ATA over Ethernet is a network protocol that provides simple access to +block storage on the LAN. + + http://support.coraid.com/documents/AoEr11.txt + +The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ... + + http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html + +It has many tips and hints! Please see, especially, recommended +tunings for virtual memory: + + http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19 + +The aoetools are userland programs that are designed to work with this +driver. The aoetools are on sourceforge. + + http://aoetools.sourceforge.net/ + +The scripts in this Documentation/aoe directory are intended to +document the use of the driver and are not necessary if you install +the aoetools. + + +Creating Device Nodes +===================== + + Users of udev should find the block device nodes created + automatically, but to create all the necessary device nodes, use the + udev configuration rules provided in udev.txt (in this directory). + + There is a udev-install.sh script that shows how to install these + rules on your system. + + There is also an autoload script that shows how to edit + /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when + necessary. Preloading the aoe module is preferable to autoloading, + however, because AoE discovery takes a few seconds. It can be + confusing when an AoE device is not present the first time the a + command is run but appears a second later. + +Using Device Nodes +================== + + "cat /dev/etherd/err" blocks, waiting for error diagnostic output, + like any retransmitted packets. + + "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to + limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from + untrusted networks should be ignored as a matter of security. See + also the aoe_iflist driver option described below. + + "echo > /dev/etherd/discover" tells the driver to find out what AoE + devices are available. + + In the future these character devices may disappear and be replaced + by sysfs counterparts. Using the commands in aoetools insulates + users from these implementation details. + + The block devices are named like this:: + + e{shelf}.{slot} + e{shelf}.{slot}p{part} + + ... so that "e0.2" is the third blade from the left (slot 2) in the + first shelf (shelf address zero). That's the whole disk. The first + partition on that disk would be "e0.2p1". + +Using sysfs +=========== + + Each aoe block device in /sys/block has the extra attributes of + state, mac, and netif. The state attribute is "up" when the device + is ready for I/O and "down" if detected but unusable. The + "down,closewait" state shows that the device is still open and + cannot come up again until it has been closed. + + The mac attribute is the ethernet address of the remote AoE device. + The netif attribute is the network interface on the localhost + through which we are communicating with the remote AoE device. + + There is a script in this directory that formats this information in + a convenient way. Users with aoetools should use the aoe-stat + command:: + + root@makki root# sh Documentation/aoe/status.sh + e10.0 eth3 up + e10.1 eth3 up + e10.2 eth3 up + e10.3 eth3 up + e10.4 eth3 up + e10.5 eth3 up + e10.6 eth3 up + e10.7 eth3 up + e10.8 eth3 up + e10.9 eth3 up + e4.0 eth1 up + e4.1 eth1 up + e4.2 eth1 up + e4.3 eth1 up + e4.4 eth1 up + e4.5 eth1 up + e4.6 eth1 up + e4.7 eth1 up + e4.8 eth1 up + e4.9 eth1 up + + Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver + option discussed below) instead of /dev/etherd/interfaces to limit + AoE traffic to the network interfaces in the given + whitespace-separated list. Unlike the old character device, the + sysfs entry can be read from as well as written to. + + It's helpful to trigger discovery after setting the list of allowed + interfaces. The aoetools package provides an aoe-discover script + for this purpose. You can also directly use the + /dev/etherd/discover special file described above. + +Driver Options +============== + + There is a boot option for the built-in aoe driver and a + corresponding module parameter, aoe_iflist. Without this option, + all network interfaces may be used for ATA over Ethernet. Here is a + usage example for the module parameter:: + + modprobe aoe_iflist="eth1 eth3" + + The aoe_deadsecs module parameter determines the maximum number of + seconds that the driver will wait for an AoE device to provide a + response to an AoE command. After aoe_deadsecs seconds have + elapsed, the AoE device will be marked as "down". A value of zero + is supported for testing purposes and makes the aoe driver keep + trying AoE commands forever. + + The aoe_maxout module parameter has a default of 128. This is the + maximum number of unresponded packets that will be sent to an AoE + target at one time. + + The aoe_dyndevs module parameter defaults to 1, meaning that the + driver will assign a block device minor number to a discovered AoE + target based on the order of its discovery. With dynamic minor + device numbers in use, a greater range of AoE shelf and slot + addresses can be supported. Users with udev will never have to + think about minor numbers. Using aoe_dyndevs=0 allows device nodes + to be pre-created using a static minor-number scheme with the + aoe-mkshelf script in the aoetools. diff --git a/Documentation/aoe/aoe.txt b/Documentation/aoe/aoe.txt deleted file mode 100644 index c71487d399d1..000000000000 --- a/Documentation/aoe/aoe.txt +++ /dev/null @@ -1,143 +0,0 @@ -ATA over Ethernet is a network protocol that provides simple access to -block storage on the LAN. - - http://support.coraid.com/documents/AoEr11.txt - -The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ... - - http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html - -It has many tips and hints! Please see, especially, recommended -tunings for virtual memory: - - http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19 - -The aoetools are userland programs that are designed to work with this -driver. The aoetools are on sourceforge. - - http://aoetools.sourceforge.net/ - -The scripts in this Documentation/aoe directory are intended to -document the use of the driver and are not necessary if you install -the aoetools. - - -CREATING DEVICE NODES - - Users of udev should find the block device nodes created - automatically, but to create all the necessary device nodes, use the - udev configuration rules provided in udev.txt (in this directory). - - There is a udev-install.sh script that shows how to install these - rules on your system. - - There is also an autoload script that shows how to edit - /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when - necessary. Preloading the aoe module is preferable to autoloading, - however, because AoE discovery takes a few seconds. It can be - confusing when an AoE device is not present the first time the a - command is run but appears a second later. - -USING DEVICE NODES - - "cat /dev/etherd/err" blocks, waiting for error diagnostic output, - like any retransmitted packets. - - "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to - limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from - untrusted networks should be ignored as a matter of security. See - also the aoe_iflist driver option described below. - - "echo > /dev/etherd/discover" tells the driver to find out what AoE - devices are available. - - In the future these character devices may disappear and be replaced - by sysfs counterparts. Using the commands in aoetools insulates - users from these implementation details. - - The block devices are named like this: - - e{shelf}.{slot} - e{shelf}.{slot}p{part} - - ... so that "e0.2" is the third blade from the left (slot 2) in the - first shelf (shelf address zero). That's the whole disk. The first - partition on that disk would be "e0.2p1". - -USING SYSFS - - Each aoe block device in /sys/block has the extra attributes of - state, mac, and netif. The state attribute is "up" when the device - is ready for I/O and "down" if detected but unusable. The - "down,closewait" state shows that the device is still open and - cannot come up again until it has been closed. - - The mac attribute is the ethernet address of the remote AoE device. - The netif attribute is the network interface on the localhost - through which we are communicating with the remote AoE device. - - There is a script in this directory that formats this information in - a convenient way. Users with aoetools should use the aoe-stat - command. - - root@makki root# sh Documentation/aoe/status.sh - e10.0 eth3 up - e10.1 eth3 up - e10.2 eth3 up - e10.3 eth3 up - e10.4 eth3 up - e10.5 eth3 up - e10.6 eth3 up - e10.7 eth3 up - e10.8 eth3 up - e10.9 eth3 up - e4.0 eth1 up - e4.1 eth1 up - e4.2 eth1 up - e4.3 eth1 up - e4.4 eth1 up - e4.5 eth1 up - e4.6 eth1 up - e4.7 eth1 up - e4.8 eth1 up - e4.9 eth1 up - - Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver - option discussed below) instead of /dev/etherd/interfaces to limit - AoE traffic to the network interfaces in the given - whitespace-separated list. Unlike the old character device, the - sysfs entry can be read from as well as written to. - - It's helpful to trigger discovery after setting the list of allowed - interfaces. The aoetools package provides an aoe-discover script - for this purpose. You can also directly use the - /dev/etherd/discover special file described above. - -DRIVER OPTIONS - - There is a boot option for the built-in aoe driver and a - corresponding module parameter, aoe_iflist. Without this option, - all network interfaces may be used for ATA over Ethernet. Here is a - usage example for the module parameter. - - modprobe aoe_iflist="eth1 eth3" - - The aoe_deadsecs module parameter determines the maximum number of - seconds that the driver will wait for an AoE device to provide a - response to an AoE command. After aoe_deadsecs seconds have - elapsed, the AoE device will be marked as "down". A value of zero - is supported for testing purposes and makes the aoe driver keep - trying AoE commands forever. - - The aoe_maxout module parameter has a default of 128. This is the - maximum number of unresponded packets that will be sent to an AoE - target at one time. - - The aoe_dyndevs module parameter defaults to 1, meaning that the - driver will assign a block device minor number to a discovered AoE - target based on the order of its discovery. With dynamic minor - device numbers in use, a greater range of AoE shelf and slot - addresses can be supported. Users with udev will never have to - think about minor numbers. Using aoe_dyndevs=0 allows device nodes - to be pre-created using a static minor-number scheme with the - aoe-mkshelf script in the aoetools. diff --git a/Documentation/aoe/examples.rst b/Documentation/aoe/examples.rst new file mode 100644 index 000000000000..91f3198e52c1 --- /dev/null +++ b/Documentation/aoe/examples.rst @@ -0,0 +1,23 @@ +Example of udev rules +--------------------- + + .. include:: udev.txt + :literal: + +Example of udev install rules script +------------------------------------ + + .. literalinclude:: udev-install.sh + :language: shell + +Example script to get status +---------------------------- + + .. literalinclude:: status.sh + :language: shell + +Example of AoE autoload script +------------------------------ + + .. literalinclude:: autoload.sh + :language: shell diff --git a/Documentation/aoe/index.rst b/Documentation/aoe/index.rst new file mode 100644 index 000000000000..4394b9b7913c --- /dev/null +++ b/Documentation/aoe/index.rst @@ -0,0 +1,19 @@ +:orphan: + +======================= +ATA over Ethernet (AoE) +======================= + +.. toctree:: + :maxdepth: 1 + + aoe + todo + examples + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/aoe/todo.rst b/Documentation/aoe/todo.rst new file mode 100644 index 000000000000..dea8db5a33e1 --- /dev/null +++ b/Documentation/aoe/todo.rst @@ -0,0 +1,17 @@ +TODO +==== + +There is a potential for deadlock when allocating a struct sk_buff for +data that needs to be written out to aoe storage. If the data is +being written from a dirty page in order to free that page, and if +there are no other pages available, then deadlock may occur when a +free page is needed for the sk_buff allocation. This situation has +not been observed, but it would be nice to eliminate any potential for +deadlock under memory pressure. + +Because ATA over Ethernet is not fragmented by the kernel's IP code, +the destructor member of the struct sk_buff is available to the aoe +driver. By using a mempool for allocating all but the first few +sk_buffs, and by registering a destructor, we should be able to +efficiently allocate sk_buffs without introducing any potential for +deadlock. diff --git a/Documentation/aoe/todo.txt b/Documentation/aoe/todo.txt deleted file mode 100644 index c09dfad4aed8..000000000000 --- a/Documentation/aoe/todo.txt +++ /dev/null @@ -1,14 +0,0 @@ -There is a potential for deadlock when allocating a struct sk_buff for -data that needs to be written out to aoe storage. If the data is -being written from a dirty page in order to free that page, and if -there are no other pages available, then deadlock may occur when a -free page is needed for the sk_buff allocation. This situation has -not been observed, but it would be nice to eliminate any potential for -deadlock under memory pressure. - -Because ATA over Ethernet is not fragmented by the kernel's IP code, -the destructor member of the struct sk_buff is available to the aoe -driver. By using a mempool for allocating all but the first few -sk_buffs, and by registering a destructor, we should be able to -efficiently allocate sk_buffs without introducing any potential for -deadlock. diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt index 1f06daf03f5b..54feda5a0772 100644 --- a/Documentation/aoe/udev.txt +++ b/Documentation/aoe/udev.txt @@ -11,7 +11,7 @@ # udev_rules="/etc/udev/rules.d/" # bash# ls /etc/udev/rules.d/ # 10-wacom.rules 50-udev.rules -# bash# cp /path/to/linux-2.6.xx/Documentation/aoe/udev.txt \ +# bash# cp /path/to/linux/Documentation/aoe/udev.txt \ # /etc/udev/rules.d/60-aoe.rules # -- cgit From b693d0b372afb39432e1c49ad7b3454855bc6bed Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:38 -0300 Subject: docs: arm64: convert docs to ReST and rename to .rst The documentation is in a format that is very close to ReST format. The conversion is actually: - add blank lines in order to identify paragraphs; - fixing tables markups; - adding some lists markups; - marking literal blocks; - adjust some title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/arm64/acpi_object_usage.rst | 738 +++++++++++++++++++++ Documentation/arm64/acpi_object_usage.txt | 622 ----------------- Documentation/arm64/arm-acpi.rst | 528 +++++++++++++++ Documentation/arm64/arm-acpi.txt | 519 --------------- Documentation/arm64/booting.rst | 293 ++++++++ Documentation/arm64/booting.txt | 266 -------- Documentation/arm64/cpu-feature-registers.rst | 304 +++++++++ Documentation/arm64/cpu-feature-registers.txt | 296 --------- Documentation/arm64/elf_hwcaps.rst | 201 ++++++ Documentation/arm64/elf_hwcaps.txt | 231 ------- Documentation/arm64/hugetlbpage.rst | 41 ++ Documentation/arm64/hugetlbpage.txt | 38 -- Documentation/arm64/index.rst | 28 + Documentation/arm64/legacy_instructions.rst | 68 ++ Documentation/arm64/legacy_instructions.txt | 57 -- Documentation/arm64/memory.rst | 98 +++ Documentation/arm64/memory.txt | 97 --- Documentation/arm64/pointer-authentication.rst | 109 +++ Documentation/arm64/pointer-authentication.txt | 107 --- Documentation/arm64/silicon-errata.rst | 131 ++++ Documentation/arm64/silicon-errata.txt | 88 --- Documentation/arm64/sve.rst | 529 +++++++++++++++ Documentation/arm64/sve.txt | 525 --------------- Documentation/arm64/tagged-pointers.rst | 68 ++ Documentation/arm64/tagged-pointers.txt | 66 -- Documentation/translations/zh_CN/arm64/booting.txt | 4 +- .../zh_CN/arm64/legacy_instructions.txt | 4 +- Documentation/translations/zh_CN/arm64/memory.txt | 4 +- .../translations/zh_CN/arm64/silicon-errata.txt | 4 +- .../translations/zh_CN/arm64/tagged-pointers.txt | 4 +- Documentation/virtual/kvm/api.txt | 2 +- 31 files changed, 3147 insertions(+), 2923 deletions(-) create mode 100644 Documentation/arm64/acpi_object_usage.rst delete mode 100644 Documentation/arm64/acpi_object_usage.txt create mode 100644 Documentation/arm64/arm-acpi.rst delete mode 100644 Documentation/arm64/arm-acpi.txt create mode 100644 Documentation/arm64/booting.rst delete mode 100644 Documentation/arm64/booting.txt create mode 100644 Documentation/arm64/cpu-feature-registers.rst delete mode 100644 Documentation/arm64/cpu-feature-registers.txt create mode 100644 Documentation/arm64/elf_hwcaps.rst delete mode 100644 Documentation/arm64/elf_hwcaps.txt create mode 100644 Documentation/arm64/hugetlbpage.rst delete mode 100644 Documentation/arm64/hugetlbpage.txt create mode 100644 Documentation/arm64/index.rst create mode 100644 Documentation/arm64/legacy_instructions.rst delete mode 100644 Documentation/arm64/legacy_instructions.txt create mode 100644 Documentation/arm64/memory.rst delete mode 100644 Documentation/arm64/memory.txt create mode 100644 Documentation/arm64/pointer-authentication.rst delete mode 100644 Documentation/arm64/pointer-authentication.txt create mode 100644 Documentation/arm64/silicon-errata.rst delete mode 100644 Documentation/arm64/silicon-errata.txt create mode 100644 Documentation/arm64/sve.rst delete mode 100644 Documentation/arm64/sve.txt create mode 100644 Documentation/arm64/tagged-pointers.rst delete mode 100644 Documentation/arm64/tagged-pointers.txt (limited to 'Documentation') diff --git a/Documentation/arm64/acpi_object_usage.rst b/Documentation/arm64/acpi_object_usage.rst new file mode 100644 index 000000000000..d51b69dc624d --- /dev/null +++ b/Documentation/arm64/acpi_object_usage.rst @@ -0,0 +1,738 @@ +=========== +ACPI Tables +=========== + +The expectations of individual ACPI tables are discussed in the list that +follows. + +If a section number is used, it refers to a section number in the ACPI +specification where the object is defined. If "Signature Reserved" is used, +the table signature (the first four bytes of the table) is the only portion +of the table recognized by the specification, and the actual table is defined +outside of the UEFI Forum (see Section 5.2.6 of the specification). + +For ACPI on arm64, tables also fall into the following categories: + + - Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT + + - Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT + + - Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT, + MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO, + TCPA, TPM2, UEFI, XENV + + - Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT, + MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT + +====== ======================================================================== +Table Usage for ARMv8 Linux +====== ======================================================================== +BERT Section 18.3 (signature == "BERT") + + **Boot Error Record Table** + + Must be supplied if RAS support is provided by the platform. It + is recommended this table be supplied. + +BOOT Signature Reserved (signature == "BOOT") + + **simple BOOT flag table** + + Microsoft only table, will not be supported. + +BGRT Section 5.2.22 (signature == "BGRT") + + **Boot Graphics Resource Table** + + Optional, not currently supported, with no real use-case for an + ARM server. + +CPEP Section 5.2.18 (signature == "CPEP") + + **Corrected Platform Error Polling table** + + Optional, not currently supported, and not recommended until such + time as ARM-compatible hardware is available, and the specification + suitably modified. + +CSRT Signature Reserved (signature == "CSRT") + + **Core System Resources Table** + + Optional, not currently supported. + +DBG2 Signature Reserved (signature == "DBG2") + + **DeBuG port table 2** + + License has changed and should be usable. Optional if used instead + of earlycon= on the command line. + +DBGP Signature Reserved (signature == "DBGP") + + **DeBuG Port table** + + Microsoft only table, will not be supported. + +DSDT Section 5.2.11.1 (signature == "DSDT") + + **Differentiated System Description Table** + + A DSDT is required; see also SSDT. + + ACPI tables contain only one DSDT but can contain one or more SSDTs, + which are optional. Each SSDT can only add to the ACPI namespace, + but cannot modify or replace anything in the DSDT. + +DMAR Signature Reserved (signature == "DMAR") + + **DMA Remapping table** + + x86 only table, will not be supported. + +DRTM Signature Reserved (signature == "DRTM") + + **Dynamic Root of Trust for Measurement table** + + Optional, not currently supported. + +ECDT Section 5.2.16 (signature == "ECDT") + + **Embedded Controller Description Table** + + Optional, not currently supported, but could be used on ARM if and + only if one uses the GPE_BIT field to represent an IRQ number, since + there are no GPE blocks defined in hardware reduced mode. This would + need to be modified in the ACPI specification. + +EINJ Section 18.6 (signature == "EINJ") + + **Error Injection table** + + This table is very useful for testing platform response to error + conditions; it allows one to inject an error into the system as + if it had actually occurred. However, this table should not be + shipped with a production system; it should be dynamically loaded + and executed with the ACPICA tools only during testing. + +ERST Section 18.5 (signature == "ERST") + + **Error Record Serialization Table** + + On a platform supports RAS, this table must be supplied if it is not + UEFI-based; if it is UEFI-based, this table may be supplied. When this + table is not present, UEFI run time service will be utilized to save + and retrieve hardware error information to and from a persistent store. + +ETDT Signature Reserved (signature == "ETDT") + + **Event Timer Description Table** + + Obsolete table, will not be supported. + +FACS Section 5.2.10 (signature == "FACS") + + **Firmware ACPI Control Structure** + + It is unlikely that this table will be terribly useful. If it is + provided, the Global Lock will NOT be used since it is not part of + the hardware reduced profile, and only 64-bit address fields will + be considered valid. + +FADT Section 5.2.9 (signature == "FACP") + + **Fixed ACPI Description Table** + Required for arm64. + + + The HW_REDUCED_ACPI flag must be set. All of the fields that are + to be ignored when HW_REDUCED_ACPI is set are expected to be set to + zero. + + If an FACS table is provided, the X_FIRMWARE_CTRL field is to be + used, not FIRMWARE_CTRL. + + If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is + filled in properly - that the PSCI_COMPLIANT flag is set and that + PSCI_USE_HVC is set or unset as needed (see table 5-37). + + For the DSDT that is also required, the X_DSDT field is to be used, + not the DSDT field. + +FPDT Section 5.2.23 (signature == "FPDT") + + **Firmware Performance Data Table** + + Optional, not currently supported. + +GTDT Section 5.2.24 (signature == "GTDT") + + **Generic Timer Description Table** + + Required for arm64. + +HEST Section 18.3.2 (signature == "HEST") + + **Hardware Error Source Table** + + ARM-specific error sources have been defined; please use those or the + PCI types such as type 6 (AER Root Port), 7 (AER Endpoint), or 8 (AER + Bridge), or use type 9 (Generic Hardware Error Source). Firmware first + error handling is possible if and only if Trusted Firmware is being + used on arm64. + + Must be supplied if RAS support is provided by the platform. It + is recommended this table be supplied. + +HPET Signature Reserved (signature == "HPET") + + **High Precision Event timer Table** + + x86 only table, will not be supported. + +IBFT Signature Reserved (signature == "IBFT") + + **iSCSI Boot Firmware Table** + + Microsoft defined table, support TBD. + +IORT Signature Reserved (signature == "IORT") + + **Input Output Remapping Table** + + arm64 only table, required in order to describe IO topology, SMMUs, + and GIC ITSs, and how those various components are connected together, + such as identifying which components are behind which SMMUs/ITSs. + This table will only be required on certain SBSA platforms (e.g., + when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it + remains optional. + +IVRS Signature Reserved (signature == "IVRS") + + **I/O Virtualization Reporting Structure** + + x86_64 (AMD) only table, will not be supported. + +LPIT Signature Reserved (signature == "LPIT") + + **Low Power Idle Table** + + x86 only table as of ACPI 5.1; starting with ACPI 6.0, processor + descriptions and power states on ARM platforms should use the DSDT + and define processor container devices (_HID ACPI0010, Section 8.4, + and more specifically 8.4.3 and and 8.4.4). + +MADT Section 5.2.12 (signature == "APIC") + + **Multiple APIC Description Table** + + Required for arm64. Only the GIC interrupt controller structures + should be used (types 0xA - 0xF). + +MCFG Signature Reserved (signature == "MCFG") + + **Memory-mapped ConFiGuration space** + + If the platform supports PCI/PCIe, an MCFG table is required. + +MCHI Signature Reserved (signature == "MCHI") + + **Management Controller Host Interface table** + + Optional, not currently supported. + +MPST Section 5.2.21 (signature == "MPST") + + **Memory Power State Table** + + Optional, not currently supported. + +MSCT Section 5.2.19 (signature == "MSCT") + + **Maximum System Characteristic Table** + + Optional, not currently supported. + +MSDM Signature Reserved (signature == "MSDM") + + **Microsoft Data Management table** + + Microsoft only table, will not be supported. + +NFIT Section 5.2.25 (signature == "NFIT") + + **NVDIMM Firmware Interface Table** + + Optional, not currently supported. + +OEMx Signature of "OEMx" only + + **OEM Specific Tables** + + All tables starting with a signature of "OEM" are reserved for OEM + use. Since these are not meant to be of general use but are limited + to very specific end users, they are not recommended for use and are + not supported by the kernel for arm64. + +PCCT Section 14.1 (signature == "PCCT) + + **Platform Communications Channel Table** + + Recommend for use on arm64; use of PCC is recommended when using CPPC + to control performance and power for platform processors. + +PMTT Section 5.2.21.12 (signature == "PMTT") + + **Platform Memory Topology Table** + + Optional, not currently supported. + +PSDT Section 5.2.11.3 (signature == "PSDT") + + **Persistent System Description Table** + + Obsolete table, will not be supported. + +RASF Section 5.2.20 (signature == "RASF") + + **RAS Feature table** + + Optional, not currently supported. + +RSDP Section 5.2.5 (signature == "RSD PTR") + + **Root System Description PoinTeR** + + Required for arm64. + +RSDT Section 5.2.7 (signature == "RSDT") + + **Root System Description Table** + + Since this table can only provide 32-bit addresses, it is deprecated + on arm64, and will not be used. If provided, it will be ignored. + +SBST Section 5.2.14 (signature == "SBST") + + **Smart Battery Subsystem Table** + + Optional, not currently supported. + +SLIC Signature Reserved (signature == "SLIC") + + **Software LIcensing table** + + Microsoft only table, will not be supported. + +SLIT Section 5.2.17 (signature == "SLIT") + + **System Locality distance Information Table** + + Optional in general, but required for NUMA systems. + +SPCR Signature Reserved (signature == "SPCR") + + **Serial Port Console Redirection table** + + Required for arm64. + +SPMI Signature Reserved (signature == "SPMI") + + **Server Platform Management Interface table** + + Optional, not currently supported. + +SRAT Section 5.2.16 (signature == "SRAT") + + **System Resource Affinity Table** + + Optional, but if used, only the GICC Affinity structures are read. + To support arm64 NUMA, this table is required. + +SSDT Section 5.2.11.2 (signature == "SSDT") + + **Secondary System Description Table** + + These tables are a continuation of the DSDT; these are recommended + for use with devices that can be added to a running system, but can + also serve the purpose of dividing up device descriptions into more + manageable pieces. + + An SSDT can only ADD to the ACPI namespace. It cannot modify or + replace existing device descriptions already in the namespace. + + These tables are optional, however. ACPI tables should contain only + one DSDT but can contain many SSDTs. + +STAO Signature Reserved (signature == "STAO") + + **_STA Override table** + + Optional, but only necessary in virtualized environments in order to + hide devices from guest OSs. + +TCPA Signature Reserved (signature == "TCPA") + + **Trusted Computing Platform Alliance table** + + Optional, not currently supported, and may need changes to fully + interoperate with arm64. + +TPM2 Signature Reserved (signature == "TPM2") + + **Trusted Platform Module 2 table** + + Optional, not currently supported, and may need changes to fully + interoperate with arm64. + +UEFI Signature Reserved (signature == "UEFI") + + **UEFI ACPI data table** + + Optional, not currently supported. No known use case for arm64, + at present. + +WAET Signature Reserved (signature == "WAET") + + **Windows ACPI Emulated devices Table** + + Microsoft only table, will not be supported. + +WDAT Signature Reserved (signature == "WDAT") + + **Watch Dog Action Table** + + Microsoft only table, will not be supported. + +WDRT Signature Reserved (signature == "WDRT") + + **Watch Dog Resource Table** + + Microsoft only table, will not be supported. + +WPBT Signature Reserved (signature == "WPBT") + + **Windows Platform Binary Table** + + Microsoft only table, will not be supported. + +XENV Signature Reserved (signature == "XENV") + + **Xen project table** + + Optional, used only by Xen at present. + +XSDT Section 5.2.8 (signature == "XSDT") + + **eXtended System Description Table** + + Required for arm64. +====== ======================================================================== + +ACPI Objects +------------ +The expectations on individual ACPI objects that are likely to be used are +shown in the list that follows; any object not explicitly mentioned below +should be used as needed for a particular platform or particular subsystem, +such as power management or PCI. + +===== ================ ======================================================== +Name Section Usage for ARMv8 Linux +===== ================ ======================================================== +_CCA 6.2.17 This method must be defined for all bus masters + on arm64 - there are no assumptions made about + whether such devices are cache coherent or not. + The _CCA value is inherited by all descendants of + these devices so it does not need to be repeated. + Without _CCA on arm64, the kernel does not know what + to do about setting up DMA for the device. + + NB: this method provides default cache coherency + attributes; the presence of an SMMU can be used to + modify that, however. For example, a master could + default to non-coherent, but be made coherent with + the appropriate SMMU configuration (see Table 17 of + the IORT specification, ARM Document DEN 0049B). + +_CID 6.1.2 Use as needed, see also _HID. + +_CLS 6.1.3 Use as needed, see also _HID. + +_CPC 8.4.7.1 Use as needed, power management specific. CPPC is + recommended on arm64. + +_CRS 6.2.2 Required on arm64. + +_CSD 8.4.2.2 Use as needed, used only in conjunction with _CST. + +_CST 8.4.2.1 Low power idle states (8.4.4) are recommended instead + of C-states. + +_DDN 6.1.4 This field can be used for a device name. However, + it is meant for DOS device names (e.g., COM1), so be + careful of its use across OSes. + +_DSD 6.2.5 To be used with caution. If this object is used, try + to use it within the constraints already defined by the + Device Properties UUID. Only in rare circumstances + should it be necessary to create a new _DSD UUID. + + In either case, submit the _DSD definition along with + any driver patches for discussion, especially when + device properties are used. A driver will not be + considered complete without a corresponding _DSD + description. Once approved by kernel maintainers, + the UUID or device properties must then be registered + with the UEFI Forum; this may cause some iteration as + more than one OS will be registering entries. + +_DSM 9.1.1 Do not use this method. It is not standardized, the + return values are not well documented, and it is + currently a frequent source of error. + +\_GL 5.7.1 This object is not to be used in hardware reduced + mode, and therefore should not be used on arm64. + +_GLK 6.5.7 This object requires a global lock be defined; there + is no global lock on arm64 since it runs in hardware + reduced mode. Hence, do not use this object on arm64. + +\_GPE 5.3.1 This namespace is for x86 use only. Do not use it + on arm64. + +_HID 6.1.5 This is the primary object to use in device probing, + though _CID and _CLS may also be used. + +_INI 6.5.1 Not required, but can be useful in setting up devices + when UEFI leaves them in a state that may not be what + the driver expects before it starts probing. + +_LPI 8.4.4.3 Recommended for use with processor definitions (_HID + ACPI0010) on arm64. See also _RDI. + +_MLS 6.1.7 Highly recommended for use in internationalization. + +_OFF 7.2.2 It is recommended to define this method for any device + that can be turned on or off. + +_ON 7.2.3 It is recommended to define this method for any device + that can be turned on or off. + +\_OS 5.7.3 This method will return "Linux" by default (this is + the value of the macro ACPI_OS_NAME on Linux). The + command line parameter acpi_os= can be used + to set it to some other value. + +_OSC 6.2.11 This method can be a global method in ACPI (i.e., + \_SB._OSC), or it may be associated with a specific + device (e.g., \_SB.DEV0._OSC), or both. When used + as a global method, only capabilities published in + the ACPI specification are allowed. When used as + a device-specific method, the process described for + using _DSD MUST be used to create an _OSC definition; + out-of-process use of _OSC is not allowed. That is, + submit the device-specific _OSC usage description as + part of the kernel driver submission, get it approved + by the kernel community, then register it with the + UEFI Forum. + +\_OSI 5.7.2 Deprecated on ARM64. As far as ACPI firmware is + concerned, _OSI is not to be used to determine what + sort of system is being used or what functionality + is provided. The _OSC method is to be used instead. + +_PDC 8.4.1 Deprecated, do not use on arm64. + +\_PIC 5.8.1 The method should not be used. On arm64, the only + interrupt model available is GIC. + +\_PR 5.3.1 This namespace is for x86 use only on legacy systems. + Do not use it on arm64. + +_PRT 6.2.13 Required as part of the definition of all PCI root + devices. + +_PRx 7.3.8-11 Use as needed; power management specific. If _PR0 is + defined, _PR3 must also be defined. + +_PSx 7.3.2-5 Use as needed; power management specific. If _PS0 is + defined, _PS3 must also be defined. If clocks or + regulators need adjusting to be consistent with power + usage, change them in these methods. + +_RDI 8.4.4.4 Recommended for use with processor definitions (_HID + ACPI0010) on arm64. This should only be used in + conjunction with _LPI. + +\_REV 5.7.4 Always returns the latest version of ACPI supported. + +\_SB 5.3.1 Required on arm64; all devices must be defined in this + namespace. + +_SLI 6.2.15 Use is recommended when SLIT table is in use. + +_STA 6.3.7, It is recommended to define this method for any device + 7.2.4 that can be turned on or off. See also the STAO table + that provides overrides to hide devices in virtualized + environments. + +_SRS 6.2.16 Use as needed; see also _PRS. + +_STR 6.1.10 Recommended for conveying device names to end users; + this is preferred over using _DDN. + +_SUB 6.1.9 Use as needed; _HID or _CID are preferred. + +_SUN 6.1.11 Use as needed, but recommended. + +_SWS 7.4.3 Use as needed; power management specific; this may + require specification changes for use on arm64. + +_UID 6.1.12 Recommended for distinguishing devices of the same + class; define it if at all possible. +===== ================ ======================================================== + + + + +ACPI Event Model +---------------- +Do not use GPE block devices; these are not supported in the hardware reduced +profile used by arm64. Since there are no GPE blocks defined for use on ARM +platforms, ACPI events must be signaled differently. + +There are two options: GPIO-signaled interrupts (Section 5.6.5), and +interrupt-signaled events (Section 5.6.9). Interrupt-signaled events are a +new feature in the ACPI 6.1 specification. Either - or both - can be used +on a given platform, and which to use may be dependent of limitations in any +given SoC. If possible, interrupt-signaled events are recommended. + + +ACPI Processor Control +---------------------- +Section 8 of the ACPI specification changed significantly in version 6.0. +Processors should now be defined as Device objects with _HID ACPI0007; do +not use the deprecated Processor statement in ASL. All multiprocessor systems +should also define a hierarchy of processors, done with Processor Container +Devices (see Section 8.4.3.1, _HID ACPI0010); do not use processor aggregator +devices (Section 8.5) to describe processor topology. Section 8.4 of the +specification describes the semantics of these object definitions and how +they interrelate. + +Most importantly, the processor hierarchy defined also defines the low power +idle states that are available to the platform, along with the rules for +determining which processors can be turned on or off and the circumstances +that control that. Without this information, the processors will run in +whatever power state they were left in by UEFI. + +Note too, that the processor Device objects defined and the entries in the +MADT for GICs are expected to be in synchronization. The _UID of the Device +object must correspond to processor IDs used in the MADT. + +It is recommended that CPPC (8.4.5) be used as the primary model for processor +performance control on arm64. C-states and P-states may become available at +some point in the future, but most current design work appears to favor CPPC. + +Further, it is essential that the ARMv8 SoC provide a fully functional +implementation of PSCI; this will be the only mechanism supported by ACPI +to control CPU power state. Booting of secondary CPUs using the ACPI +parking protocol is possible, but discouraged, since only PSCI is supported +for ARM servers. + + +ACPI System Address Map Interfaces +---------------------------------- +In Section 15 of the ACPI specification, several methods are mentioned as +possible mechanisms for conveying memory resource information to the kernel. +For arm64, we will only support UEFI for booting with ACPI, hence the UEFI +GetMemoryMap() boot service is the only mechanism that will be used. + + +ACPI Platform Error Interfaces (APEI) +------------------------------------- +The APEI tables supported are described above. + +APEI requires the equivalent of an SCI and an NMI on ARMv8. The SCI is used +to notify the OSPM of errors that have occurred but can be corrected and the +system can continue correct operation, even if possibly degraded. The NMI is +used to indicate fatal errors that cannot be corrected, and require immediate +attention. + +Since there is no direct equivalent of the x86 SCI or NMI, arm64 handles +these slightly differently. The SCI is handled as a high priority interrupt; +given that these are corrected (or correctable) errors being reported, this +is sufficient. The NMI is emulated as the highest priority interrupt +possible. This implies some caution must be used since there could be +interrupts at higher privilege levels or even interrupts at the same priority +as the emulated NMI. In Linux, this should not be the case but one should +be aware it could happen. + + +ACPI Objects Not Supported on ARM64 +----------------------------------- +While this may change in the future, there are several classes of objects +that can be defined, but are not currently of general interest to ARM servers. +Some of these objects have x86 equivalents, and may actually make sense in ARM +servers. However, there is either no hardware available at present, or there +may not even be a non-ARM implementation yet. Hence, they are not currently +supported. + +The following classes of objects are not supported: + + - Section 9.2: ambient light sensor devices + + - Section 9.3: battery devices + + - Section 9.4: lids (e.g., laptop lids) + + - Section 9.8.2: IDE controllers + + - Section 9.9: floppy controllers + + - Section 9.10: GPE block devices + + - Section 9.15: PC/AT RTC/CMOS devices + + - Section 9.16: user presence detection devices + + - Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT + + - Section 9.18: time and alarm devices (see 9.15) + + - Section 10: power source and power meter devices + + - Section 11: thermal management + + - Section 12: embedded controllers interface + + - Section 13: SMBus interfaces + + +This also means that there is no support for the following objects: + +==== =========================== ==== ========== +Name Section Name Section +==== =========================== ==== ========== +_ALC 9.3.4 _FDM 9.10.3 +_ALI 9.3.2 _FIX 6.2.7 +_ALP 9.3.6 _GAI 10.4.5 +_ALR 9.3.5 _GHL 10.4.7 +_ALT 9.3.3 _GTM 9.9.2.1.1 +_BCT 10.2.2.10 _LID 9.5.1 +_BDN 6.5.3 _PAI 10.4.4 +_BIF 10.2.2.1 _PCL 10.3.2 +_BIX 10.2.2.1 _PIF 10.3.3 +_BLT 9.2.3 _PMC 10.4.1 +_BMA 10.2.2.4 _PMD 10.4.8 +_BMC 10.2.2.12 _PMM 10.4.3 +_BMD 10.2.2.11 _PRL 10.3.4 +_BMS 10.2.2.5 _PSR 10.3.1 +_BST 10.2.2.6 _PTP 10.4.2 +_BTH 10.2.2.7 _SBS 10.1.3 +_BTM 10.2.2.9 _SHL 10.4.6 +_BTP 10.2.2.8 _STM 9.9.2.1.1 +_DCK 6.5.2 _UPD 9.16.1 +_EC 12.12 _UPP 9.16.2 +_FDE 9.10.1 _WPC 10.5.2 +_FDI 9.10.2 _WPP 10.5.3 +==== =========================== ==== ========== diff --git a/Documentation/arm64/acpi_object_usage.txt b/Documentation/arm64/acpi_object_usage.txt deleted file mode 100644 index c77010c5c1f0..000000000000 --- a/Documentation/arm64/acpi_object_usage.txt +++ /dev/null @@ -1,622 +0,0 @@ -ACPI Tables ------------ -The expectations of individual ACPI tables are discussed in the list that -follows. - -If a section number is used, it refers to a section number in the ACPI -specification where the object is defined. If "Signature Reserved" is used, -the table signature (the first four bytes of the table) is the only portion -of the table recognized by the specification, and the actual table is defined -outside of the UEFI Forum (see Section 5.2.6 of the specification). - -For ACPI on arm64, tables also fall into the following categories: - - -- Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT - - -- Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT - - -- Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT, - MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO, - TCPA, TPM2, UEFI, XENV - - -- Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT, - MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT - -Table Usage for ARMv8 Linux ------ ---------------------------------------------------------------- -BERT Section 18.3 (signature == "BERT") - == Boot Error Record Table == - Must be supplied if RAS support is provided by the platform. It - is recommended this table be supplied. - -BOOT Signature Reserved (signature == "BOOT") - == simple BOOT flag table == - Microsoft only table, will not be supported. - -BGRT Section 5.2.22 (signature == "BGRT") - == Boot Graphics Resource Table == - Optional, not currently supported, with no real use-case for an - ARM server. - -CPEP Section 5.2.18 (signature == "CPEP") - == Corrected Platform Error Polling table == - Optional, not currently supported, and not recommended until such - time as ARM-compatible hardware is available, and the specification - suitably modified. - -CSRT Signature Reserved (signature == "CSRT") - == Core System Resources Table == - Optional, not currently supported. - -DBG2 Signature Reserved (signature == "DBG2") - == DeBuG port table 2 == - License has changed and should be usable. Optional if used instead - of earlycon= on the command line. - -DBGP Signature Reserved (signature == "DBGP") - == DeBuG Port table == - Microsoft only table, will not be supported. - -DSDT Section 5.2.11.1 (signature == "DSDT") - == Differentiated System Description Table == - A DSDT is required; see also SSDT. - - ACPI tables contain only one DSDT but can contain one or more SSDTs, - which are optional. Each SSDT can only add to the ACPI namespace, - but cannot modify or replace anything in the DSDT. - -DMAR Signature Reserved (signature == "DMAR") - == DMA Remapping table == - x86 only table, will not be supported. - -DRTM Signature Reserved (signature == "DRTM") - == Dynamic Root of Trust for Measurement table == - Optional, not currently supported. - -ECDT Section 5.2.16 (signature == "ECDT") - == Embedded Controller Description Table == - Optional, not currently supported, but could be used on ARM if and - only if one uses the GPE_BIT field to represent an IRQ number, since - there are no GPE blocks defined in hardware reduced mode. This would - need to be modified in the ACPI specification. - -EINJ Section 18.6 (signature == "EINJ") - == Error Injection table == - This table is very useful for testing platform response to error - conditions; it allows one to inject an error into the system as - if it had actually occurred. However, this table should not be - shipped with a production system; it should be dynamically loaded - and executed with the ACPICA tools only during testing. - -ERST Section 18.5 (signature == "ERST") - == Error Record Serialization Table == - On a platform supports RAS, this table must be supplied if it is not - UEFI-based; if it is UEFI-based, this table may be supplied. When this - table is not present, UEFI run time service will be utilized to save - and retrieve hardware error information to and from a persistent store. - -ETDT Signature Reserved (signature == "ETDT") - == Event Timer Description Table == - Obsolete table, will not be supported. - -FACS Section 5.2.10 (signature == "FACS") - == Firmware ACPI Control Structure == - It is unlikely that this table will be terribly useful. If it is - provided, the Global Lock will NOT be used since it is not part of - the hardware reduced profile, and only 64-bit address fields will - be considered valid. - -FADT Section 5.2.9 (signature == "FACP") - == Fixed ACPI Description Table == - Required for arm64. - - The HW_REDUCED_ACPI flag must be set. All of the fields that are - to be ignored when HW_REDUCED_ACPI is set are expected to be set to - zero. - - If an FACS table is provided, the X_FIRMWARE_CTRL field is to be - used, not FIRMWARE_CTRL. - - If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is - filled in properly -- that the PSCI_COMPLIANT flag is set and that - PSCI_USE_HVC is set or unset as needed (see table 5-37). - - For the DSDT that is also required, the X_DSDT field is to be used, - not the DSDT field. - -FPDT Section 5.2.23 (signature == "FPDT") - == Firmware Performance Data Table == - Optional, not currently supported. - -GTDT Section 5.2.24 (signature == "GTDT") - == Generic Timer Description Table == - Required for arm64. - -HEST Section 18.3.2 (signature == "HEST") - == Hardware Error Source Table == - ARM-specific error sources have been defined; please use those or the - PCI types such as type 6 (AER Root Port), 7 (AER Endpoint), or 8 (AER - Bridge), or use type 9 (Generic Hardware Error Source). Firmware first - error handling is possible if and only if Trusted Firmware is being - used on arm64. - - Must be supplied if RAS support is provided by the platform. It - is recommended this table be supplied. - -HPET Signature Reserved (signature == "HPET") - == High Precision Event timer Table == - x86 only table, will not be supported. - -IBFT Signature Reserved (signature == "IBFT") - == iSCSI Boot Firmware Table == - Microsoft defined table, support TBD. - -IORT Signature Reserved (signature == "IORT") - == Input Output Remapping Table == - arm64 only table, required in order to describe IO topology, SMMUs, - and GIC ITSs, and how those various components are connected together, - such as identifying which components are behind which SMMUs/ITSs. - This table will only be required on certain SBSA platforms (e.g., - when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it - remains optional. - -IVRS Signature Reserved (signature == "IVRS") - == I/O Virtualization Reporting Structure == - x86_64 (AMD) only table, will not be supported. - -LPIT Signature Reserved (signature == "LPIT") - == Low Power Idle Table == - x86 only table as of ACPI 5.1; starting with ACPI 6.0, processor - descriptions and power states on ARM platforms should use the DSDT - and define processor container devices (_HID ACPI0010, Section 8.4, - and more specifically 8.4.3 and and 8.4.4). - -MADT Section 5.2.12 (signature == "APIC") - == Multiple APIC Description Table == - Required for arm64. Only the GIC interrupt controller structures - should be used (types 0xA - 0xF). - -MCFG Signature Reserved (signature == "MCFG") - == Memory-mapped ConFiGuration space == - If the platform supports PCI/PCIe, an MCFG table is required. - -MCHI Signature Reserved (signature == "MCHI") - == Management Controller Host Interface table == - Optional, not currently supported. - -MPST Section 5.2.21 (signature == "MPST") - == Memory Power State Table == - Optional, not currently supported. - -MSCT Section 5.2.19 (signature == "MSCT") - == Maximum System Characteristic Table == - Optional, not currently supported. - -MSDM Signature Reserved (signature == "MSDM") - == Microsoft Data Management table == - Microsoft only table, will not be supported. - -NFIT Section 5.2.25 (signature == "NFIT") - == NVDIMM Firmware Interface Table == - Optional, not currently supported. - -OEMx Signature of "OEMx" only - == OEM Specific Tables == - All tables starting with a signature of "OEM" are reserved for OEM - use. Since these are not meant to be of general use but are limited - to very specific end users, they are not recommended for use and are - not supported by the kernel for arm64. - -PCCT Section 14.1 (signature == "PCCT) - == Platform Communications Channel Table == - Recommend for use on arm64; use of PCC is recommended when using CPPC - to control performance and power for platform processors. - -PMTT Section 5.2.21.12 (signature == "PMTT") - == Platform Memory Topology Table == - Optional, not currently supported. - -PSDT Section 5.2.11.3 (signature == "PSDT") - == Persistent System Description Table == - Obsolete table, will not be supported. - -RASF Section 5.2.20 (signature == "RASF") - == RAS Feature table == - Optional, not currently supported. - -RSDP Section 5.2.5 (signature == "RSD PTR") - == Root System Description PoinTeR == - Required for arm64. - -RSDT Section 5.2.7 (signature == "RSDT") - == Root System Description Table == - Since this table can only provide 32-bit addresses, it is deprecated - on arm64, and will not be used. If provided, it will be ignored. - -SBST Section 5.2.14 (signature == "SBST") - == Smart Battery Subsystem Table == - Optional, not currently supported. - -SLIC Signature Reserved (signature == "SLIC") - == Software LIcensing table == - Microsoft only table, will not be supported. - -SLIT Section 5.2.17 (signature == "SLIT") - == System Locality distance Information Table == - Optional in general, but required for NUMA systems. - -SPCR Signature Reserved (signature == "SPCR") - == Serial Port Console Redirection table == - Required for arm64. - -SPMI Signature Reserved (signature == "SPMI") - == Server Platform Management Interface table == - Optional, not currently supported. - -SRAT Section 5.2.16 (signature == "SRAT") - == System Resource Affinity Table == - Optional, but if used, only the GICC Affinity structures are read. - To support arm64 NUMA, this table is required. - -SSDT Section 5.2.11.2 (signature == "SSDT") - == Secondary System Description Table == - These tables are a continuation of the DSDT; these are recommended - for use with devices that can be added to a running system, but can - also serve the purpose of dividing up device descriptions into more - manageable pieces. - - An SSDT can only ADD to the ACPI namespace. It cannot modify or - replace existing device descriptions already in the namespace. - - These tables are optional, however. ACPI tables should contain only - one DSDT but can contain many SSDTs. - -STAO Signature Reserved (signature == "STAO") - == _STA Override table == - Optional, but only necessary in virtualized environments in order to - hide devices from guest OSs. - -TCPA Signature Reserved (signature == "TCPA") - == Trusted Computing Platform Alliance table == - Optional, not currently supported, and may need changes to fully - interoperate with arm64. - -TPM2 Signature Reserved (signature == "TPM2") - == Trusted Platform Module 2 table == - Optional, not currently supported, and may need changes to fully - interoperate with arm64. - -UEFI Signature Reserved (signature == "UEFI") - == UEFI ACPI data table == - Optional, not currently supported. No known use case for arm64, - at present. - -WAET Signature Reserved (signature == "WAET") - == Windows ACPI Emulated devices Table == - Microsoft only table, will not be supported. - -WDAT Signature Reserved (signature == "WDAT") - == Watch Dog Action Table == - Microsoft only table, will not be supported. - -WDRT Signature Reserved (signature == "WDRT") - == Watch Dog Resource Table == - Microsoft only table, will not be supported. - -WPBT Signature Reserved (signature == "WPBT") - == Windows Platform Binary Table == - Microsoft only table, will not be supported. - -XENV Signature Reserved (signature == "XENV") - == Xen project table == - Optional, used only by Xen at present. - -XSDT Section 5.2.8 (signature == "XSDT") - == eXtended System Description Table == - Required for arm64. - - -ACPI Objects ------------- -The expectations on individual ACPI objects that are likely to be used are -shown in the list that follows; any object not explicitly mentioned below -should be used as needed for a particular platform or particular subsystem, -such as power management or PCI. - -Name Section Usage for ARMv8 Linux ----- ------------ ------------------------------------------------- -_CCA 6.2.17 This method must be defined for all bus masters - on arm64 -- there are no assumptions made about - whether such devices are cache coherent or not. - The _CCA value is inherited by all descendants of - these devices so it does not need to be repeated. - Without _CCA on arm64, the kernel does not know what - to do about setting up DMA for the device. - - NB: this method provides default cache coherency - attributes; the presence of an SMMU can be used to - modify that, however. For example, a master could - default to non-coherent, but be made coherent with - the appropriate SMMU configuration (see Table 17 of - the IORT specification, ARM Document DEN 0049B). - -_CID 6.1.2 Use as needed, see also _HID. - -_CLS 6.1.3 Use as needed, see also _HID. - -_CPC 8.4.7.1 Use as needed, power management specific. CPPC is - recommended on arm64. - -_CRS 6.2.2 Required on arm64. - -_CSD 8.4.2.2 Use as needed, used only in conjunction with _CST. - -_CST 8.4.2.1 Low power idle states (8.4.4) are recommended instead - of C-states. - -_DDN 6.1.4 This field can be used for a device name. However, - it is meant for DOS device names (e.g., COM1), so be - careful of its use across OSes. - -_DSD 6.2.5 To be used with caution. If this object is used, try - to use it within the constraints already defined by the - Device Properties UUID. Only in rare circumstances - should it be necessary to create a new _DSD UUID. - - In either case, submit the _DSD definition along with - any driver patches for discussion, especially when - device properties are used. A driver will not be - considered complete without a corresponding _DSD - description. Once approved by kernel maintainers, - the UUID or device properties must then be registered - with the UEFI Forum; this may cause some iteration as - more than one OS will be registering entries. - -_DSM 9.1.1 Do not use this method. It is not standardized, the - return values are not well documented, and it is - currently a frequent source of error. - -\_GL 5.7.1 This object is not to be used in hardware reduced - mode, and therefore should not be used on arm64. - -_GLK 6.5.7 This object requires a global lock be defined; there - is no global lock on arm64 since it runs in hardware - reduced mode. Hence, do not use this object on arm64. - -\_GPE 5.3.1 This namespace is for x86 use only. Do not use it - on arm64. - -_HID 6.1.5 This is the primary object to use in device probing, - though _CID and _CLS may also be used. - -_INI 6.5.1 Not required, but can be useful in setting up devices - when UEFI leaves them in a state that may not be what - the driver expects before it starts probing. - -_LPI 8.4.4.3 Recommended for use with processor definitions (_HID - ACPI0010) on arm64. See also _RDI. - -_MLS 6.1.7 Highly recommended for use in internationalization. - -_OFF 7.2.2 It is recommended to define this method for any device - that can be turned on or off. - -_ON 7.2.3 It is recommended to define this method for any device - that can be turned on or off. - -\_OS 5.7.3 This method will return "Linux" by default (this is - the value of the macro ACPI_OS_NAME on Linux). The - command line parameter acpi_os= can be used - to set it to some other value. - -_OSC 6.2.11 This method can be a global method in ACPI (i.e., - \_SB._OSC), or it may be associated with a specific - device (e.g., \_SB.DEV0._OSC), or both. When used - as a global method, only capabilities published in - the ACPI specification are allowed. When used as - a device-specific method, the process described for - using _DSD MUST be used to create an _OSC definition; - out-of-process use of _OSC is not allowed. That is, - submit the device-specific _OSC usage description as - part of the kernel driver submission, get it approved - by the kernel community, then register it with the - UEFI Forum. - -\_OSI 5.7.2 Deprecated on ARM64. As far as ACPI firmware is - concerned, _OSI is not to be used to determine what - sort of system is being used or what functionality - is provided. The _OSC method is to be used instead. - -_PDC 8.4.1 Deprecated, do not use on arm64. - -\_PIC 5.8.1 The method should not be used. On arm64, the only - interrupt model available is GIC. - -\_PR 5.3.1 This namespace is for x86 use only on legacy systems. - Do not use it on arm64. - -_PRT 6.2.13 Required as part of the definition of all PCI root - devices. - -_PRx 7.3.8-11 Use as needed; power management specific. If _PR0 is - defined, _PR3 must also be defined. - -_PSx 7.3.2-5 Use as needed; power management specific. If _PS0 is - defined, _PS3 must also be defined. If clocks or - regulators need adjusting to be consistent with power - usage, change them in these methods. - -_RDI 8.4.4.4 Recommended for use with processor definitions (_HID - ACPI0010) on arm64. This should only be used in - conjunction with _LPI. - -\_REV 5.7.4 Always returns the latest version of ACPI supported. - -\_SB 5.3.1 Required on arm64; all devices must be defined in this - namespace. - -_SLI 6.2.15 Use is recommended when SLIT table is in use. - -_STA 6.3.7, It is recommended to define this method for any device - 7.2.4 that can be turned on or off. See also the STAO table - that provides overrides to hide devices in virtualized - environments. - -_SRS 6.2.16 Use as needed; see also _PRS. - -_STR 6.1.10 Recommended for conveying device names to end users; - this is preferred over using _DDN. - -_SUB 6.1.9 Use as needed; _HID or _CID are preferred. - -_SUN 6.1.11 Use as needed, but recommended. - -_SWS 7.4.3 Use as needed; power management specific; this may - require specification changes for use on arm64. - -_UID 6.1.12 Recommended for distinguishing devices of the same - class; define it if at all possible. - - - - -ACPI Event Model ----------------- -Do not use GPE block devices; these are not supported in the hardware reduced -profile used by arm64. Since there are no GPE blocks defined for use on ARM -platforms, ACPI events must be signaled differently. - -There are two options: GPIO-signaled interrupts (Section 5.6.5), and -interrupt-signaled events (Section 5.6.9). Interrupt-signaled events are a -new feature in the ACPI 6.1 specification. Either -- or both -- can be used -on a given platform, and which to use may be dependent of limitations in any -given SoC. If possible, interrupt-signaled events are recommended. - - -ACPI Processor Control ----------------------- -Section 8 of the ACPI specification changed significantly in version 6.0. -Processors should now be defined as Device objects with _HID ACPI0007; do -not use the deprecated Processor statement in ASL. All multiprocessor systems -should also define a hierarchy of processors, done with Processor Container -Devices (see Section 8.4.3.1, _HID ACPI0010); do not use processor aggregator -devices (Section 8.5) to describe processor topology. Section 8.4 of the -specification describes the semantics of these object definitions and how -they interrelate. - -Most importantly, the processor hierarchy defined also defines the low power -idle states that are available to the platform, along with the rules for -determining which processors can be turned on or off and the circumstances -that control that. Without this information, the processors will run in -whatever power state they were left in by UEFI. - -Note too, that the processor Device objects defined and the entries in the -MADT for GICs are expected to be in synchronization. The _UID of the Device -object must correspond to processor IDs used in the MADT. - -It is recommended that CPPC (8.4.5) be used as the primary model for processor -performance control on arm64. C-states and P-states may become available at -some point in the future, but most current design work appears to favor CPPC. - -Further, it is essential that the ARMv8 SoC provide a fully functional -implementation of PSCI; this will be the only mechanism supported by ACPI -to control CPU power state. Booting of secondary CPUs using the ACPI -parking protocol is possible, but discouraged, since only PSCI is supported -for ARM servers. - - -ACPI System Address Map Interfaces ----------------------------------- -In Section 15 of the ACPI specification, several methods are mentioned as -possible mechanisms for conveying memory resource information to the kernel. -For arm64, we will only support UEFI for booting with ACPI, hence the UEFI -GetMemoryMap() boot service is the only mechanism that will be used. - - -ACPI Platform Error Interfaces (APEI) -------------------------------------- -The APEI tables supported are described above. - -APEI requires the equivalent of an SCI and an NMI on ARMv8. The SCI is used -to notify the OSPM of errors that have occurred but can be corrected and the -system can continue correct operation, even if possibly degraded. The NMI is -used to indicate fatal errors that cannot be corrected, and require immediate -attention. - -Since there is no direct equivalent of the x86 SCI or NMI, arm64 handles -these slightly differently. The SCI is handled as a high priority interrupt; -given that these are corrected (or correctable) errors being reported, this -is sufficient. The NMI is emulated as the highest priority interrupt -possible. This implies some caution must be used since there could be -interrupts at higher privilege levels or even interrupts at the same priority -as the emulated NMI. In Linux, this should not be the case but one should -be aware it could happen. - - -ACPI Objects Not Supported on ARM64 ------------------------------------ -While this may change in the future, there are several classes of objects -that can be defined, but are not currently of general interest to ARM servers. -Some of these objects have x86 equivalents, and may actually make sense in ARM -servers. However, there is either no hardware available at present, or there -may not even be a non-ARM implementation yet. Hence, they are not currently -supported. - -The following classes of objects are not supported: - - -- Section 9.2: ambient light sensor devices - - -- Section 9.3: battery devices - - -- Section 9.4: lids (e.g., laptop lids) - - -- Section 9.8.2: IDE controllers - - -- Section 9.9: floppy controllers - - -- Section 9.10: GPE block devices - - -- Section 9.15: PC/AT RTC/CMOS devices - - -- Section 9.16: user presence detection devices - - -- Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT - - -- Section 9.18: time and alarm devices (see 9.15) - - -- Section 10: power source and power meter devices - - -- Section 11: thermal management - - -- Section 12: embedded controllers interface - - -- Section 13: SMBus interfaces - - -This also means that there is no support for the following objects: - -Name Section Name Section ----- ------------ ---- ------------ -_ALC 9.3.4 _FDM 9.10.3 -_ALI 9.3.2 _FIX 6.2.7 -_ALP 9.3.6 _GAI 10.4.5 -_ALR 9.3.5 _GHL 10.4.7 -_ALT 9.3.3 _GTM 9.9.2.1.1 -_BCT 10.2.2.10 _LID 9.5.1 -_BDN 6.5.3 _PAI 10.4.4 -_BIF 10.2.2.1 _PCL 10.3.2 -_BIX 10.2.2.1 _PIF 10.3.3 -_BLT 9.2.3 _PMC 10.4.1 -_BMA 10.2.2.4 _PMD 10.4.8 -_BMC 10.2.2.12 _PMM 10.4.3 -_BMD 10.2.2.11 _PRL 10.3.4 -_BMS 10.2.2.5 _PSR 10.3.1 -_BST 10.2.2.6 _PTP 10.4.2 -_BTH 10.2.2.7 _SBS 10.1.3 -_BTM 10.2.2.9 _SHL 10.4.6 -_BTP 10.2.2.8 _STM 9.9.2.1.1 -_DCK 6.5.2 _UPD 9.16.1 -_EC 12.12 _UPP 9.16.2 -_FDE 9.10.1 _WPC 10.5.2 -_FDI 9.10.2 _WPP 10.5.3 - diff --git a/Documentation/arm64/arm-acpi.rst b/Documentation/arm64/arm-acpi.rst new file mode 100644 index 000000000000..872dbbc73d4a --- /dev/null +++ b/Documentation/arm64/arm-acpi.rst @@ -0,0 +1,528 @@ +===================== +ACPI on ARMv8 Servers +===================== + +ACPI can be used for ARMv8 general purpose servers designed to follow +the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server +Base Boot Requirements) [1] specifications. Please note that the SBBR +can be retrieved simply by visiting [1], but the SBSA is currently only +available to those with an ARM login due to ARM IP licensing concerns. + +The ARMv8 kernel implements the reduced hardware model of ACPI version +5.1 or later. Links to the specification and all external documents +it refers to are managed by the UEFI Forum. The specification is +available at http://www.uefi.org/specifications and documents referenced +by the specification can be found via http://www.uefi.org/acpi. + +If an ARMv8 system does not meet the requirements of the SBSA and SBBR, +or cannot be described using the mechanisms defined in the required ACPI +specifications, then ACPI may not be a good fit for the hardware. + +While the documents mentioned above set out the requirements for building +industry-standard ARMv8 servers, they also apply to more than one operating +system. The purpose of this document is to describe the interaction between +ACPI and Linux only, on an ARMv8 system -- that is, what Linux expects of +ACPI and what ACPI can expect of Linux. + + +Why ACPI on ARM? +---------------- +Before examining the details of the interface between ACPI and Linux, it is +useful to understand why ACPI is being used. Several technologies already +exist in Linux for describing non-enumerable hardware, after all. In this +section we summarize a blog post [2] from Grant Likely that outlines the +reasoning behind ACPI on ARMv8 servers. Actually, we snitch a good portion +of the summary text almost directly, to be honest. + +The short form of the rationale for ACPI on ARM is: + +- ACPI’s byte code (AML) allows the platform to encode hardware behavior, + while DT explicitly does not support this. For hardware vendors, being + able to encode behavior is a key tool used in supporting operating + system releases on new hardware. + +- ACPI’s OSPM defines a power management model that constrains what the + platform is allowed to do into a specific model, while still providing + flexibility in hardware design. + +- In the enterprise server environment, ACPI has established bindings (such + as for RAS) which are currently used in production systems. DT does not. + Such bindings could be defined in DT at some point, but doing so means ARM + and x86 would end up using completely different code paths in both firmware + and the kernel. + +- Choosing a single interface to describe the abstraction between a platform + and an OS is important. Hardware vendors would not be required to implement + both DT and ACPI if they want to support multiple operating systems. And, + agreeing on a single interface instead of being fragmented into per OS + interfaces makes for better interoperability overall. + +- The new ACPI governance process works well and Linux is now at the same + table as hardware vendors and other OS vendors. In fact, there is no + longer any reason to feel that ACPI only belongs to Windows or that + Linux is in any way secondary to Microsoft in this arena. The move of + ACPI governance into the UEFI forum has significantly opened up the + specification development process, and currently, a large portion of the + changes being made to ACPI are being driven by Linux. + +Key to the use of ACPI is the support model. For servers in general, the +responsibility for hardware behaviour cannot solely be the domain of the +kernel, but rather must be split between the platform and the kernel, in +order to allow for orderly change over time. ACPI frees the OS from needing +to understand all the minute details of the hardware so that the OS doesn’t +need to be ported to each and every device individually. It allows the +hardware vendors to take responsibility for power management behaviour without +depending on an OS release cycle which is not under their control. + +ACPI is also important because hardware and OS vendors have already worked +out the mechanisms for supporting a general purpose computing ecosystem. The +infrastructure is in place, the bindings are in place, and the processes are +in place. DT does exactly what Linux needs it to when working with vertically +integrated devices, but there are no good processes for supporting what the +server vendors need. Linux could potentially get there with DT, but doing so +really just duplicates something that already works. ACPI already does what +the hardware vendors need, Microsoft won’t collaborate on DT, and hardware +vendors would still end up providing two completely separate firmware +interfaces -- one for Linux and one for Windows. + + +Kernel Compatibility +-------------------- +One of the primary motivations for ACPI is standardization, and using that +to provide backward compatibility for Linux kernels. In the server market, +software and hardware are often used for long periods. ACPI allows the +kernel and firmware to agree on a consistent abstraction that can be +maintained over time, even as hardware or software change. As long as the +abstraction is supported, systems can be updated without necessarily having +to replace the kernel. + +When a Linux driver or subsystem is first implemented using ACPI, it by +definition ends up requiring a specific version of the ACPI specification +-- it's baseline. ACPI firmware must continue to work, even though it may +not be optimal, with the earliest kernel version that first provides support +for that baseline version of ACPI. There may be a need for additional drivers, +but adding new functionality (e.g., CPU power management) should not break +older kernel versions. Further, ACPI firmware must also work with the most +recent version of the kernel. + + +Relationship with Device Tree +----------------------------- +ACPI support in drivers and subsystems for ARMv8 should never be mutually +exclusive with DT support at compile time. + +At boot time the kernel will only use one description method depending on +parameters passed from the boot loader (including kernel bootargs). + +Regardless of whether DT or ACPI is used, the kernel must always be capable +of booting with either scheme (in kernels with both schemes enabled at compile +time). + + +Booting using ACPI tables +------------------------- +The only defined method for passing ACPI tables to the kernel on ARMv8 +is via the UEFI system configuration table. Just so it is explicit, this +means that ACPI is only supported on platforms that boot via UEFI. + +When an ARMv8 system boots, it can either have DT information, ACPI tables, +or in some very unusual cases, both. If no command line parameters are used, +the kernel will try to use DT for device enumeration; if there is no DT +present, the kernel will try to use ACPI tables, but only if they are present. +In neither is available, the kernel will not boot. If acpi=force is used +on the command line, the kernel will attempt to use ACPI tables first, but +fall back to DT if there are no ACPI tables present. The basic idea is that +the kernel will not fail to boot unless it absolutely has no other choice. + +Processing of ACPI tables may be disabled by passing acpi=off on the kernel +command line; this is the default behavior. + +In order for the kernel to load and use ACPI tables, the UEFI implementation +MUST set the ACPI_20_TABLE_GUID to point to the RSDP table (the table with +the ACPI signature "RSD PTR "). If this pointer is incorrect and acpi=force +is used, the kernel will disable ACPI and try to use DT to boot instead; the +kernel has, in effect, determined that ACPI tables are not present at that +point. + +If the pointer to the RSDP table is correct, the table will be mapped into +the kernel by the ACPI core, using the address provided by UEFI. + +The ACPI core will then locate and map in all other ACPI tables provided by +using the addresses in the RSDP table to find the XSDT (eXtended System +Description Table). The XSDT in turn provides the addresses to all other +ACPI tables provided by the system firmware; the ACPI core will then traverse +this table and map in the tables listed. + +The ACPI core will ignore any provided RSDT (Root System Description Table). +RSDTs have been deprecated and are ignored on arm64 since they only allow +for 32-bit addresses. + +Further, the ACPI core will only use the 64-bit address fields in the FADT +(Fixed ACPI Description Table). Any 32-bit address fields in the FADT will +be ignored on arm64. + +Hardware reduced mode (see Section 4.1 of the ACPI 6.1 specification) will +be enforced by the ACPI core on arm64. Doing so allows the ACPI core to +run less complex code since it no longer has to provide support for legacy +hardware from other architectures. Any fields that are not to be used for +hardware reduced mode must be set to zero. + +For the ACPI core to operate properly, and in turn provide the information +the kernel needs to configure devices, it expects to find the following +tables (all section numbers refer to the ACPI 6.1 specification): + + - RSDP (Root System Description Pointer), section 5.2.5 + + - XSDT (eXtended System Description Table), section 5.2.8 + + - FADT (Fixed ACPI Description Table), section 5.2.9 + + - DSDT (Differentiated System Description Table), section + 5.2.11.1 + + - MADT (Multiple APIC Description Table), section 5.2.12 + + - GTDT (Generic Timer Description Table), section 5.2.24 + + - If PCI is supported, the MCFG (Memory mapped ConFiGuration + Table), section 5.2.6, specifically Table 5-31. + + - If booting without a console= kernel parameter is + supported, the SPCR (Serial Port Console Redirection table), + section 5.2.6, specifically Table 5-31. + + - If necessary to describe the I/O topology, SMMUs and GIC ITSs, + the IORT (Input Output Remapping Table, section 5.2.6, specifically + Table 5-31). + + - If NUMA is supported, the SRAT (System Resource Affinity Table) + and SLIT (System Locality distance Information Table), sections + 5.2.16 and 5.2.17, respectively. + +If the above tables are not all present, the kernel may or may not be +able to boot properly since it may not be able to configure all of the +devices available. This list of tables is not meant to be all inclusive; +in some environments other tables may be needed (e.g., any of the APEI +tables from section 18) to support specific functionality. + + +ACPI Detection +-------------- +Drivers should determine their probe() type by checking for a null +value for ACPI_HANDLE, or checking .of_node, or other information in +the device structure. This is detailed further in the "Driver +Recommendations" section. + +In non-driver code, if the presence of ACPI needs to be detected at +run time, then check the value of acpi_disabled. If CONFIG_ACPI is not +set, acpi_disabled will always be 1. + + +Device Enumeration +------------------ +Device descriptions in ACPI should use standard recognized ACPI interfaces. +These may contain less information than is typically provided via a Device +Tree description for the same device. This is also one of the reasons that +ACPI can be useful -- the driver takes into account that it may have less +detailed information about the device and uses sensible defaults instead. +If done properly in the driver, the hardware can change and improve over +time without the driver having to change at all. + +Clocks provide an excellent example. In DT, clocks need to be specified +and the drivers need to take them into account. In ACPI, the assumption +is that UEFI will leave the device in a reasonable default state, including +any clock settings. If for some reason the driver needs to change a clock +value, this can be done in an ACPI method; all the driver needs to do is +invoke the method and not concern itself with what the method needs to do +to change the clock. Changing the hardware can then take place over time +by changing what the ACPI method does, and not the driver. + +In DT, the parameters needed by the driver to set up clocks as in the example +above are known as "bindings"; in ACPI, these are known as "Device Properties" +and provided to a driver via the _DSD object. + +ACPI tables are described with a formal language called ASL, the ACPI +Source Language (section 19 of the specification). This means that there +are always multiple ways to describe the same thing -- including device +properties. For example, device properties could use an ASL construct +that looks like this: Name(KEY0, "value0"). An ACPI device driver would +then retrieve the value of the property by evaluating the KEY0 object. +However, using Name() this way has multiple problems: (1) ACPI limits +names ("KEY0") to four characters unlike DT; (2) there is no industry +wide registry that maintains a list of names, minimizing re-use; (3) +there is also no registry for the definition of property values ("value0"), +again making re-use difficult; and (4) how does one maintain backward +compatibility as new hardware comes out? The _DSD method was created +to solve precisely these sorts of problems; Linux drivers should ALWAYS +use the _DSD method for device properties and nothing else. + +The _DSM object (ACPI Section 9.14.1) could also be used for conveying +device properties to a driver. Linux drivers should only expect it to +be used if _DSD cannot represent the data required, and there is no way +to create a new UUID for the _DSD object. Note that there is even less +regulation of the use of _DSM than there is of _DSD. Drivers that depend +on the contents of _DSM objects will be more difficult to maintain over +time because of this; as of this writing, the use of _DSM is the cause +of quite a few firmware problems and is not recommended. + +Drivers should look for device properties in the _DSD object ONLY; the _DSD +object is described in the ACPI specification section 6.2.5, but this only +describes how to define the structure of an object returned via _DSD, and +how specific data structures are defined by specific UUIDs. Linux should +only use the _DSD Device Properties UUID [5]: + + - UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 + + - http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf + +The UEFI Forum provides a mechanism for registering device properties [4] +so that they may be used across all operating systems supporting ACPI. +Device properties that have not been registered with the UEFI Forum should +not be used. + +Before creating new device properties, check to be sure that they have not +been defined before and either registered in the Linux kernel documentation +as DT bindings, or the UEFI Forum as device properties. While we do not want +to simply move all DT bindings into ACPI device properties, we can learn from +what has been previously defined. + +If it is necessary to define a new device property, or if it makes sense to +synthesize the definition of a binding so it can be used in any firmware, +both DT bindings and ACPI device properties for device drivers have review +processes. Use them both. When the driver itself is submitted for review +to the Linux mailing lists, the device property definitions needed must be +submitted at the same time. A driver that supports ACPI and uses device +properties will not be considered complete without their definitions. Once +the device property has been accepted by the Linux community, it must be +registered with the UEFI Forum [4], which will review it again for consistency +within the registry. This may require iteration. The UEFI Forum, though, +will always be the canonical site for device property definitions. + +It may make sense to provide notice to the UEFI Forum that there is the +intent to register a previously unused device property name as a means of +reserving the name for later use. Other operating system vendors will +also be submitting registration requests and this may help smooth the +process. + +Once registration and review have been completed, the kernel provides an +interface for looking up device properties in a manner independent of +whether DT or ACPI is being used. This API should be used [6]; it can +eliminate some duplication of code paths in driver probing functions and +discourage divergence between DT bindings and ACPI device properties. + + +Programmable Power Control Resources +------------------------------------ +Programmable power control resources include such resources as voltage/current +providers (regulators) and clock sources. + +With ACPI, the kernel clock and regulator framework is not expected to be used +at all. + +The kernel assumes that power control of these resources is represented with +Power Resource Objects (ACPI section 7.1). The ACPI core will then handle +correctly enabling and disabling resources as they are needed. In order to +get that to work, ACPI assumes each device has defined D-states and that these +can be controlled through the optional ACPI methods _PS0, _PS1, _PS2, and _PS3; +in ACPI, _PS0 is the method to invoke to turn a device full on, and _PS3 is for +turning a device full off. + +There are two options for using those Power Resources. They can: + + - be managed in a _PSx method which gets called on entry to power + state Dx. + + - be declared separately as power resources with their own _ON and _OFF + methods. They are then tied back to D-states for a particular device + via _PRx which specifies which power resources a device needs to be on + while in Dx. Kernel then tracks number of devices using a power resource + and calls _ON/_OFF as needed. + +The kernel ACPI code will also assume that the _PSx methods follow the normal +ACPI rules for such methods: + + - If either _PS0 or _PS3 is implemented, then the other method must also + be implemented. + + - If a device requires usage or setup of a power resource when on, the ASL + should organize that it is allocated/enabled using the _PS0 method. + + - Resources allocated or enabled in the _PS0 method should be disabled + or de-allocated in the _PS3 method. + + - Firmware will leave the resources in a reasonable state before handing + over control to the kernel. + +Such code in _PSx methods will of course be very platform specific. But, +this allows the driver to abstract out the interface for operating the device +and avoid having to read special non-standard values from ACPI tables. Further, +abstracting the use of these resources allows the hardware to change over time +without requiring updates to the driver. + + +Clocks +------ +ACPI makes the assumption that clocks are initialized by the firmware -- +UEFI, in this case -- to some working value before control is handed over +to the kernel. This has implications for devices such as UARTs, or SoC-driven +LCD displays, for example. + +When the kernel boots, the clocks are assumed to be set to reasonable +working values. If for some reason the frequency needs to change -- e.g., +throttling for power management -- the device driver should expect that +process to be abstracted out into some ACPI method that can be invoked +(please see the ACPI specification for further recommendations on standard +methods to be expected). The only exceptions to this are CPU clocks where +CPPC provides a much richer interface than ACPI methods. If the clocks +are not set, there is no direct way for Linux to control them. + +If an SoC vendor wants to provide fine-grained control of the system clocks, +they could do so by providing ACPI methods that could be invoked by Linux +drivers. However, this is NOT recommended and Linux drivers should NOT use +such methods, even if they are provided. Such methods are not currently +standardized in the ACPI specification, and using them could tie a kernel +to a very specific SoC, or tie an SoC to a very specific version of the +kernel, both of which we are trying to avoid. + + +Driver Recommendations +---------------------- +DO NOT remove any DT handling when adding ACPI support for a driver. The +same device may be used on many different systems. + +DO try to structure the driver so that it is data-driven. That is, set up +a struct containing internal per-device state based on defaults and whatever +else must be discovered by the driver probe function. Then, have the rest +of the driver operate off of the contents of that struct. Doing so should +allow most divergence between ACPI and DT functionality to be kept local to +the probe function instead of being scattered throughout the driver. For +example:: + + static int device_probe_dt(struct platform_device *pdev) + { + /* DT specific functionality */ + ... + } + + static int device_probe_acpi(struct platform_device *pdev) + { + /* ACPI specific functionality */ + ... + } + + static int device_probe(struct platform_device *pdev) + { + ... + struct device_node node = pdev->dev.of_node; + ... + + if (node) + ret = device_probe_dt(pdev); + else if (ACPI_HANDLE(&pdev->dev)) + ret = device_probe_acpi(pdev); + else + /* other initialization */ + ... + /* Continue with any generic probe operations */ + ... + } + +DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it +clear the different names the driver is probed for, both from DT and from +ACPI:: + + static struct of_device_id virtio_mmio_match[] = { + { .compatible = "virtio,mmio", }, + { } + }; + MODULE_DEVICE_TABLE(of, virtio_mmio_match); + + static const struct acpi_device_id virtio_mmio_acpi_match[] = { + { "LNRO0005", }, + { } + }; + MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); + + +ASWG +---- +The ACPI specification changes regularly. During the year 2014, for instance, +version 5.1 was released and version 6.0 substantially completed, with most of +the changes being driven by ARM-specific requirements. Proposed changes are +presented and discussed in the ASWG (ACPI Specification Working Group) which +is a part of the UEFI Forum. The current version of the ACPI specification +is 6.1 release in January 2016. + +Participation in this group is open to all UEFI members. Please see +http://www.uefi.org/workinggroup for details on group membership. + +It is the intent of the ARMv8 ACPI kernel code to follow the ACPI specification +as closely as possible, and to only implement functionality that complies with +the released standards from UEFI ASWG. As a practical matter, there will be +vendors that provide bad ACPI tables or violate the standards in some way. +If this is because of errors, quirks and fix-ups may be necessary, but will +be avoided if possible. If there are features missing from ACPI that preclude +it from being used on a platform, ECRs (Engineering Change Requests) should be +submitted to ASWG and go through the normal approval process; for those that +are not UEFI members, many other members of the Linux community are and would +likely be willing to assist in submitting ECRs. + + +Linux Code +---------- +Individual items specific to Linux on ARM, contained in the the Linux +source code, are in the list that follows: + +ACPI_OS_NAME + This macro defines the string to be returned when + an ACPI method invokes the _OS method. On ARM64 + systems, this macro will be "Linux" by default. + The command line parameter acpi_os= + can be used to set it to some other value. The + default value for other architectures is "Microsoft + Windows NT", for example. + +ACPI Objects +------------ +Detailed expectations for ACPI tables and object are listed in the file +Documentation/arm64/acpi_object_usage.rst. + + +References +---------- +[0] http://silver.arm.com + document ARM-DEN-0029, or newer: + "Server Base System Architecture", version 2.3, dated 27 Mar 2014 + +[1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf + Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System + Software on ARM Platforms", dated 16 Aug 2014 + +[2] http://www.secretlab.ca/archives/151, + 10 Jan 2015, Copyright (c) 2015, + Linaro Ltd., written by Grant Likely. + +[3] AMD ACPI for Seattle platform documentation + http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf + + +[4] http://www.uefi.org/acpi + please see the link for the "ACPI _DSD Device + Property Registry Instructions" + +[5] http://www.uefi.org/acpi + please see the link for the "_DSD (Device + Specific Data) Implementation Guide" + +[6] Kernel code for the unified device + property interface can be found in + include/linux/property.h and drivers/base/property.c. + + +Authors +------- +- Al Stone +- Graeme Gregory +- Hanjun Guo + +- Grant Likely , for the "Why ACPI on ARM?" section diff --git a/Documentation/arm64/arm-acpi.txt b/Documentation/arm64/arm-acpi.txt deleted file mode 100644 index 1a74a041a443..000000000000 --- a/Documentation/arm64/arm-acpi.txt +++ /dev/null @@ -1,519 +0,0 @@ -ACPI on ARMv8 Servers ---------------------- -ACPI can be used for ARMv8 general purpose servers designed to follow -the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server -Base Boot Requirements) [1] specifications. Please note that the SBBR -can be retrieved simply by visiting [1], but the SBSA is currently only -available to those with an ARM login due to ARM IP licensing concerns. - -The ARMv8 kernel implements the reduced hardware model of ACPI version -5.1 or later. Links to the specification and all external documents -it refers to are managed by the UEFI Forum. The specification is -available at http://www.uefi.org/specifications and documents referenced -by the specification can be found via http://www.uefi.org/acpi. - -If an ARMv8 system does not meet the requirements of the SBSA and SBBR, -or cannot be described using the mechanisms defined in the required ACPI -specifications, then ACPI may not be a good fit for the hardware. - -While the documents mentioned above set out the requirements for building -industry-standard ARMv8 servers, they also apply to more than one operating -system. The purpose of this document is to describe the interaction between -ACPI and Linux only, on an ARMv8 system -- that is, what Linux expects of -ACPI and what ACPI can expect of Linux. - - -Why ACPI on ARM? ----------------- -Before examining the details of the interface between ACPI and Linux, it is -useful to understand why ACPI is being used. Several technologies already -exist in Linux for describing non-enumerable hardware, after all. In this -section we summarize a blog post [2] from Grant Likely that outlines the -reasoning behind ACPI on ARMv8 servers. Actually, we snitch a good portion -of the summary text almost directly, to be honest. - -The short form of the rationale for ACPI on ARM is: - --- ACPI’s byte code (AML) allows the platform to encode hardware behavior, - while DT explicitly does not support this. For hardware vendors, being - able to encode behavior is a key tool used in supporting operating - system releases on new hardware. - --- ACPI’s OSPM defines a power management model that constrains what the - platform is allowed to do into a specific model, while still providing - flexibility in hardware design. - --- In the enterprise server environment, ACPI has established bindings (such - as for RAS) which are currently used in production systems. DT does not. - Such bindings could be defined in DT at some point, but doing so means ARM - and x86 would end up using completely different code paths in both firmware - and the kernel. - --- Choosing a single interface to describe the abstraction between a platform - and an OS is important. Hardware vendors would not be required to implement - both DT and ACPI if they want to support multiple operating systems. And, - agreeing on a single interface instead of being fragmented into per OS - interfaces makes for better interoperability overall. - --- The new ACPI governance process works well and Linux is now at the same - table as hardware vendors and other OS vendors. In fact, there is no - longer any reason to feel that ACPI only belongs to Windows or that - Linux is in any way secondary to Microsoft in this arena. The move of - ACPI governance into the UEFI forum has significantly opened up the - specification development process, and currently, a large portion of the - changes being made to ACPI are being driven by Linux. - -Key to the use of ACPI is the support model. For servers in general, the -responsibility for hardware behaviour cannot solely be the domain of the -kernel, but rather must be split between the platform and the kernel, in -order to allow for orderly change over time. ACPI frees the OS from needing -to understand all the minute details of the hardware so that the OS doesn’t -need to be ported to each and every device individually. It allows the -hardware vendors to take responsibility for power management behaviour without -depending on an OS release cycle which is not under their control. - -ACPI is also important because hardware and OS vendors have already worked -out the mechanisms for supporting a general purpose computing ecosystem. The -infrastructure is in place, the bindings are in place, and the processes are -in place. DT does exactly what Linux needs it to when working with vertically -integrated devices, but there are no good processes for supporting what the -server vendors need. Linux could potentially get there with DT, but doing so -really just duplicates something that already works. ACPI already does what -the hardware vendors need, Microsoft won’t collaborate on DT, and hardware -vendors would still end up providing two completely separate firmware -interfaces -- one for Linux and one for Windows. - - -Kernel Compatibility --------------------- -One of the primary motivations for ACPI is standardization, and using that -to provide backward compatibility for Linux kernels. In the server market, -software and hardware are often used for long periods. ACPI allows the -kernel and firmware to agree on a consistent abstraction that can be -maintained over time, even as hardware or software change. As long as the -abstraction is supported, systems can be updated without necessarily having -to replace the kernel. - -When a Linux driver or subsystem is first implemented using ACPI, it by -definition ends up requiring a specific version of the ACPI specification --- it's baseline. ACPI firmware must continue to work, even though it may -not be optimal, with the earliest kernel version that first provides support -for that baseline version of ACPI. There may be a need for additional drivers, -but adding new functionality (e.g., CPU power management) should not break -older kernel versions. Further, ACPI firmware must also work with the most -recent version of the kernel. - - -Relationship with Device Tree ------------------------------ -ACPI support in drivers and subsystems for ARMv8 should never be mutually -exclusive with DT support at compile time. - -At boot time the kernel will only use one description method depending on -parameters passed from the boot loader (including kernel bootargs). - -Regardless of whether DT or ACPI is used, the kernel must always be capable -of booting with either scheme (in kernels with both schemes enabled at compile -time). - - -Booting using ACPI tables -------------------------- -The only defined method for passing ACPI tables to the kernel on ARMv8 -is via the UEFI system configuration table. Just so it is explicit, this -means that ACPI is only supported on platforms that boot via UEFI. - -When an ARMv8 system boots, it can either have DT information, ACPI tables, -or in some very unusual cases, both. If no command line parameters are used, -the kernel will try to use DT for device enumeration; if there is no DT -present, the kernel will try to use ACPI tables, but only if they are present. -In neither is available, the kernel will not boot. If acpi=force is used -on the command line, the kernel will attempt to use ACPI tables first, but -fall back to DT if there are no ACPI tables present. The basic idea is that -the kernel will not fail to boot unless it absolutely has no other choice. - -Processing of ACPI tables may be disabled by passing acpi=off on the kernel -command line; this is the default behavior. - -In order for the kernel to load and use ACPI tables, the UEFI implementation -MUST set the ACPI_20_TABLE_GUID to point to the RSDP table (the table with -the ACPI signature "RSD PTR "). If this pointer is incorrect and acpi=force -is used, the kernel will disable ACPI and try to use DT to boot instead; the -kernel has, in effect, determined that ACPI tables are not present at that -point. - -If the pointer to the RSDP table is correct, the table will be mapped into -the kernel by the ACPI core, using the address provided by UEFI. - -The ACPI core will then locate and map in all other ACPI tables provided by -using the addresses in the RSDP table to find the XSDT (eXtended System -Description Table). The XSDT in turn provides the addresses to all other -ACPI tables provided by the system firmware; the ACPI core will then traverse -this table and map in the tables listed. - -The ACPI core will ignore any provided RSDT (Root System Description Table). -RSDTs have been deprecated and are ignored on arm64 since they only allow -for 32-bit addresses. - -Further, the ACPI core will only use the 64-bit address fields in the FADT -(Fixed ACPI Description Table). Any 32-bit address fields in the FADT will -be ignored on arm64. - -Hardware reduced mode (see Section 4.1 of the ACPI 6.1 specification) will -be enforced by the ACPI core on arm64. Doing so allows the ACPI core to -run less complex code since it no longer has to provide support for legacy -hardware from other architectures. Any fields that are not to be used for -hardware reduced mode must be set to zero. - -For the ACPI core to operate properly, and in turn provide the information -the kernel needs to configure devices, it expects to find the following -tables (all section numbers refer to the ACPI 6.1 specification): - - -- RSDP (Root System Description Pointer), section 5.2.5 - - -- XSDT (eXtended System Description Table), section 5.2.8 - - -- FADT (Fixed ACPI Description Table), section 5.2.9 - - -- DSDT (Differentiated System Description Table), section - 5.2.11.1 - - -- MADT (Multiple APIC Description Table), section 5.2.12 - - -- GTDT (Generic Timer Description Table), section 5.2.24 - - -- If PCI is supported, the MCFG (Memory mapped ConFiGuration - Table), section 5.2.6, specifically Table 5-31. - - -- If booting without a console= kernel parameter is - supported, the SPCR (Serial Port Console Redirection table), - section 5.2.6, specifically Table 5-31. - - -- If necessary to describe the I/O topology, SMMUs and GIC ITSs, - the IORT (Input Output Remapping Table, section 5.2.6, specifically - Table 5-31). - - -- If NUMA is supported, the SRAT (System Resource Affinity Table) - and SLIT (System Locality distance Information Table), sections - 5.2.16 and 5.2.17, respectively. - -If the above tables are not all present, the kernel may or may not be -able to boot properly since it may not be able to configure all of the -devices available. This list of tables is not meant to be all inclusive; -in some environments other tables may be needed (e.g., any of the APEI -tables from section 18) to support specific functionality. - - -ACPI Detection --------------- -Drivers should determine their probe() type by checking for a null -value for ACPI_HANDLE, or checking .of_node, or other information in -the device structure. This is detailed further in the "Driver -Recommendations" section. - -In non-driver code, if the presence of ACPI needs to be detected at -run time, then check the value of acpi_disabled. If CONFIG_ACPI is not -set, acpi_disabled will always be 1. - - -Device Enumeration ------------------- -Device descriptions in ACPI should use standard recognized ACPI interfaces. -These may contain less information than is typically provided via a Device -Tree description for the same device. This is also one of the reasons that -ACPI can be useful -- the driver takes into account that it may have less -detailed information about the device and uses sensible defaults instead. -If done properly in the driver, the hardware can change and improve over -time without the driver having to change at all. - -Clocks provide an excellent example. In DT, clocks need to be specified -and the drivers need to take them into account. In ACPI, the assumption -is that UEFI will leave the device in a reasonable default state, including -any clock settings. If for some reason the driver needs to change a clock -value, this can be done in an ACPI method; all the driver needs to do is -invoke the method and not concern itself with what the method needs to do -to change the clock. Changing the hardware can then take place over time -by changing what the ACPI method does, and not the driver. - -In DT, the parameters needed by the driver to set up clocks as in the example -above are known as "bindings"; in ACPI, these are known as "Device Properties" -and provided to a driver via the _DSD object. - -ACPI tables are described with a formal language called ASL, the ACPI -Source Language (section 19 of the specification). This means that there -are always multiple ways to describe the same thing -- including device -properties. For example, device properties could use an ASL construct -that looks like this: Name(KEY0, "value0"). An ACPI device driver would -then retrieve the value of the property by evaluating the KEY0 object. -However, using Name() this way has multiple problems: (1) ACPI limits -names ("KEY0") to four characters unlike DT; (2) there is no industry -wide registry that maintains a list of names, minimizing re-use; (3) -there is also no registry for the definition of property values ("value0"), -again making re-use difficult; and (4) how does one maintain backward -compatibility as new hardware comes out? The _DSD method was created -to solve precisely these sorts of problems; Linux drivers should ALWAYS -use the _DSD method for device properties and nothing else. - -The _DSM object (ACPI Section 9.14.1) could also be used for conveying -device properties to a driver. Linux drivers should only expect it to -be used if _DSD cannot represent the data required, and there is no way -to create a new UUID for the _DSD object. Note that there is even less -regulation of the use of _DSM than there is of _DSD. Drivers that depend -on the contents of _DSM objects will be more difficult to maintain over -time because of this; as of this writing, the use of _DSM is the cause -of quite a few firmware problems and is not recommended. - -Drivers should look for device properties in the _DSD object ONLY; the _DSD -object is described in the ACPI specification section 6.2.5, but this only -describes how to define the structure of an object returned via _DSD, and -how specific data structures are defined by specific UUIDs. Linux should -only use the _DSD Device Properties UUID [5]: - - -- UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 - - -- http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf - -The UEFI Forum provides a mechanism for registering device properties [4] -so that they may be used across all operating systems supporting ACPI. -Device properties that have not been registered with the UEFI Forum should -not be used. - -Before creating new device properties, check to be sure that they have not -been defined before and either registered in the Linux kernel documentation -as DT bindings, or the UEFI Forum as device properties. While we do not want -to simply move all DT bindings into ACPI device properties, we can learn from -what has been previously defined. - -If it is necessary to define a new device property, or if it makes sense to -synthesize the definition of a binding so it can be used in any firmware, -both DT bindings and ACPI device properties for device drivers have review -processes. Use them both. When the driver itself is submitted for review -to the Linux mailing lists, the device property definitions needed must be -submitted at the same time. A driver that supports ACPI and uses device -properties will not be considered complete without their definitions. Once -the device property has been accepted by the Linux community, it must be -registered with the UEFI Forum [4], which will review it again for consistency -within the registry. This may require iteration. The UEFI Forum, though, -will always be the canonical site for device property definitions. - -It may make sense to provide notice to the UEFI Forum that there is the -intent to register a previously unused device property name as a means of -reserving the name for later use. Other operating system vendors will -also be submitting registration requests and this may help smooth the -process. - -Once registration and review have been completed, the kernel provides an -interface for looking up device properties in a manner independent of -whether DT or ACPI is being used. This API should be used [6]; it can -eliminate some duplication of code paths in driver probing functions and -discourage divergence between DT bindings and ACPI device properties. - - -Programmable Power Control Resources ------------------------------------- -Programmable power control resources include such resources as voltage/current -providers (regulators) and clock sources. - -With ACPI, the kernel clock and regulator framework is not expected to be used -at all. - -The kernel assumes that power control of these resources is represented with -Power Resource Objects (ACPI section 7.1). The ACPI core will then handle -correctly enabling and disabling resources as they are needed. In order to -get that to work, ACPI assumes each device has defined D-states and that these -can be controlled through the optional ACPI methods _PS0, _PS1, _PS2, and _PS3; -in ACPI, _PS0 is the method to invoke to turn a device full on, and _PS3 is for -turning a device full off. - -There are two options for using those Power Resources. They can: - - -- be managed in a _PSx method which gets called on entry to power - state Dx. - - -- be declared separately as power resources with their own _ON and _OFF - methods. They are then tied back to D-states for a particular device - via _PRx which specifies which power resources a device needs to be on - while in Dx. Kernel then tracks number of devices using a power resource - and calls _ON/_OFF as needed. - -The kernel ACPI code will also assume that the _PSx methods follow the normal -ACPI rules for such methods: - - -- If either _PS0 or _PS3 is implemented, then the other method must also - be implemented. - - -- If a device requires usage or setup of a power resource when on, the ASL - should organize that it is allocated/enabled using the _PS0 method. - - -- Resources allocated or enabled in the _PS0 method should be disabled - or de-allocated in the _PS3 method. - - -- Firmware will leave the resources in a reasonable state before handing - over control to the kernel. - -Such code in _PSx methods will of course be very platform specific. But, -this allows the driver to abstract out the interface for operating the device -and avoid having to read special non-standard values from ACPI tables. Further, -abstracting the use of these resources allows the hardware to change over time -without requiring updates to the driver. - - -Clocks ------- -ACPI makes the assumption that clocks are initialized by the firmware -- -UEFI, in this case -- to some working value before control is handed over -to the kernel. This has implications for devices such as UARTs, or SoC-driven -LCD displays, for example. - -When the kernel boots, the clocks are assumed to be set to reasonable -working values. If for some reason the frequency needs to change -- e.g., -throttling for power management -- the device driver should expect that -process to be abstracted out into some ACPI method that can be invoked -(please see the ACPI specification for further recommendations on standard -methods to be expected). The only exceptions to this are CPU clocks where -CPPC provides a much richer interface than ACPI methods. If the clocks -are not set, there is no direct way for Linux to control them. - -If an SoC vendor wants to provide fine-grained control of the system clocks, -they could do so by providing ACPI methods that could be invoked by Linux -drivers. However, this is NOT recommended and Linux drivers should NOT use -such methods, even if they are provided. Such methods are not currently -standardized in the ACPI specification, and using them could tie a kernel -to a very specific SoC, or tie an SoC to a very specific version of the -kernel, both of which we are trying to avoid. - - -Driver Recommendations ----------------------- -DO NOT remove any DT handling when adding ACPI support for a driver. The -same device may be used on many different systems. - -DO try to structure the driver so that it is data-driven. That is, set up -a struct containing internal per-device state based on defaults and whatever -else must be discovered by the driver probe function. Then, have the rest -of the driver operate off of the contents of that struct. Doing so should -allow most divergence between ACPI and DT functionality to be kept local to -the probe function instead of being scattered throughout the driver. For -example: - -static int device_probe_dt(struct platform_device *pdev) -{ - /* DT specific functionality */ - ... -} - -static int device_probe_acpi(struct platform_device *pdev) -{ - /* ACPI specific functionality */ - ... -} - -static int device_probe(struct platform_device *pdev) -{ - ... - struct device_node node = pdev->dev.of_node; - ... - - if (node) - ret = device_probe_dt(pdev); - else if (ACPI_HANDLE(&pdev->dev)) - ret = device_probe_acpi(pdev); - else - /* other initialization */ - ... - /* Continue with any generic probe operations */ - ... -} - -DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it -clear the different names the driver is probed for, both from DT and from -ACPI: - -static struct of_device_id virtio_mmio_match[] = { - { .compatible = "virtio,mmio", }, - { } -}; -MODULE_DEVICE_TABLE(of, virtio_mmio_match); - -static const struct acpi_device_id virtio_mmio_acpi_match[] = { - { "LNRO0005", }, - { } -}; -MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); - - -ASWG ----- -The ACPI specification changes regularly. During the year 2014, for instance, -version 5.1 was released and version 6.0 substantially completed, with most of -the changes being driven by ARM-specific requirements. Proposed changes are -presented and discussed in the ASWG (ACPI Specification Working Group) which -is a part of the UEFI Forum. The current version of the ACPI specification -is 6.1 release in January 2016. - -Participation in this group is open to all UEFI members. Please see -http://www.uefi.org/workinggroup for details on group membership. - -It is the intent of the ARMv8 ACPI kernel code to follow the ACPI specification -as closely as possible, and to only implement functionality that complies with -the released standards from UEFI ASWG. As a practical matter, there will be -vendors that provide bad ACPI tables or violate the standards in some way. -If this is because of errors, quirks and fix-ups may be necessary, but will -be avoided if possible. If there are features missing from ACPI that preclude -it from being used on a platform, ECRs (Engineering Change Requests) should be -submitted to ASWG and go through the normal approval process; for those that -are not UEFI members, many other members of the Linux community are and would -likely be willing to assist in submitting ECRs. - - -Linux Code ----------- -Individual items specific to Linux on ARM, contained in the the Linux -source code, are in the list that follows: - -ACPI_OS_NAME This macro defines the string to be returned when - an ACPI method invokes the _OS method. On ARM64 - systems, this macro will be "Linux" by default. - The command line parameter acpi_os= - can be used to set it to some other value. The - default value for other architectures is "Microsoft - Windows NT", for example. - -ACPI Objects ------------- -Detailed expectations for ACPI tables and object are listed in the file -Documentation/arm64/acpi_object_usage.txt. - - -References ----------- -[0] http://silver.arm.com -- document ARM-DEN-0029, or newer - "Server Base System Architecture", version 2.3, dated 27 Mar 2014 - -[1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf - Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System - Software on ARM Platforms", dated 16 Aug 2014 - -[2] http://www.secretlab.ca/archives/151, 10 Jan 2015, Copyright (c) 2015, - Linaro Ltd., written by Grant Likely. - -[3] AMD ACPI for Seattle platform documentation: - http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf - -[4] http://www.uefi.org/acpi -- please see the link for the "ACPI _DSD Device - Property Registry Instructions" - -[5] http://www.uefi.org/acpi -- please see the link for the "_DSD (Device - Specific Data) Implementation Guide" - -[6] Kernel code for the unified device property interface can be found in - include/linux/property.h and drivers/base/property.c. - - -Authors -------- -Al Stone -Graeme Gregory -Hanjun Guo - -Grant Likely , for the "Why ACPI on ARM?" section diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst new file mode 100644 index 000000000000..3d041d0d16e8 --- /dev/null +++ b/Documentation/arm64/booting.rst @@ -0,0 +1,293 @@ +===================== +Booting AArch64 Linux +===================== + +Author: Will Deacon + +Date : 07 September 2012 + +This document is based on the ARM booting document by Russell King and +is relevant to all public releases of the AArch64 Linux kernel. + +The AArch64 exception model is made up of a number of exception levels +(EL0 - EL3), with EL0 and EL1 having a secure and a non-secure +counterpart. EL2 is the hypervisor level and exists only in non-secure +mode. EL3 is the highest priority level and exists only in secure mode. + +For the purposes of this document, we will use the term `boot loader` +simply to define all software that executes on the CPU(s) before control +is passed to the Linux kernel. This may include secure monitor and +hypervisor code, or it may just be a handful of instructions for +preparing a minimal boot environment. + +Essentially, the boot loader should provide (as a minimum) the +following: + +1. Setup and initialise the RAM +2. Setup the device tree +3. Decompress the kernel image +4. Call the kernel image + + +1. Setup and initialise RAM +--------------------------- + +Requirement: MANDATORY + +The boot loader is expected to find and initialise all RAM that the +kernel will use for volatile data storage in the system. It performs +this in a machine dependent manner. (It may use internal algorithms +to automatically locate and size all RAM, or it may use knowledge of +the RAM in the machine, or any other method the boot loader designer +sees fit.) + + +2. Setup the device tree +------------------------- + +Requirement: MANDATORY + +The device tree blob (dtb) must be placed on an 8-byte boundary and must +not exceed 2 megabytes in size. Since the dtb will be mapped cacheable +using blocks of up to 2 megabytes in size, it must not be placed within +any 2M region which must be mapped with any specific attributes. + +NOTE: versions prior to v4.2 also require that the DTB be placed within +the 512 MB region starting at text_offset bytes below the kernel Image. + +3. Decompress the kernel image +------------------------------ + +Requirement: OPTIONAL + +The AArch64 kernel does not currently provide a decompressor and +therefore requires decompression (gzip etc.) to be performed by the boot +loader if a compressed Image target (e.g. Image.gz) is used. For +bootloaders that do not implement this requirement, the uncompressed +Image target is available instead. + + +4. Call the kernel image +------------------------ + +Requirement: MANDATORY + +The decompressed kernel image contains a 64-byte header as follows:: + + u32 code0; /* Executable code */ + u32 code1; /* Executable code */ + u64 text_offset; /* Image load offset, little endian */ + u64 image_size; /* Effective Image size, little endian */ + u64 flags; /* kernel flags, little endian */ + u64 res2 = 0; /* reserved */ + u64 res3 = 0; /* reserved */ + u64 res4 = 0; /* reserved */ + u32 magic = 0x644d5241; /* Magic number, little endian, "ARM\x64" */ + u32 res5; /* reserved (used for PE COFF offset) */ + + +Header notes: + +- As of v3.17, all fields are little endian unless stated otherwise. + +- code0/code1 are responsible for branching to stext. + +- when booting through EFI, code0/code1 are initially skipped. + res5 is an offset to the PE header and the PE header has the EFI + entry point (efi_stub_entry). When the stub has done its work, it + jumps to code0 to resume the normal boot process. + +- Prior to v3.17, the endianness of text_offset was not specified. In + these cases image_size is zero and text_offset is 0x80000 in the + endianness of the kernel. Where image_size is non-zero image_size is + little-endian and must be respected. Where image_size is zero, + text_offset can be assumed to be 0x80000. + +- The flags field (introduced in v3.17) is a little-endian 64-bit field + composed as follows: + + ============= =============================================================== + Bit 0 Kernel endianness. 1 if BE, 0 if LE. + Bit 1-2 Kernel Page size. + + * 0 - Unspecified. + * 1 - 4K + * 2 - 16K + * 3 - 64K + Bit 3 Kernel physical placement + + 0 + 2MB aligned base should be as close as possible + to the base of DRAM, since memory below it is not + accessible via the linear mapping + 1 + 2MB aligned base may be anywhere in physical + memory + Bits 4-63 Reserved. + ============= =============================================================== + +- When image_size is zero, a bootloader should attempt to keep as much + memory as possible free for use by the kernel immediately after the + end of the kernel image. The amount of space required will vary + depending on selected features, and is effectively unbound. + +The Image must be placed text_offset bytes from a 2MB aligned base +address anywhere in usable system RAM and called there. The region +between the 2 MB aligned base address and the start of the image has no +special significance to the kernel, and may be used for other purposes. +At least image_size bytes from the start of the image must be free for +use by the kernel. +NOTE: versions prior to v4.6 cannot make use of memory below the +physical offset of the Image so it is recommended that the Image be +placed as close as possible to the start of system RAM. + +If an initrd/initramfs is passed to the kernel at boot, it must reside +entirely within a 1 GB aligned physical memory window of up to 32 GB in +size that fully covers the kernel Image as well. + +Any memory described to the kernel (even that below the start of the +image) which is not marked as reserved from the kernel (e.g., with a +memreserve region in the device tree) will be considered as available to +the kernel. + +Before jumping into the kernel, the following conditions must be met: + +- Quiesce all DMA capable devices so that memory does not get + corrupted by bogus network packets or disk data. This will save + you many hours of debug. + +- Primary CPU general-purpose register settings: + + - x0 = physical address of device tree blob (dtb) in system RAM. + - x1 = 0 (reserved for future use) + - x2 = 0 (reserved for future use) + - x3 = 0 (reserved for future use) + +- CPU mode + + All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError, + IRQ and FIQ). + The CPU must be in either EL2 (RECOMMENDED in order to have access to + the virtualisation extensions) or non-secure EL1. + +- Caches, MMUs + + The MMU must be off. + Instruction cache may be on or off. + The address range corresponding to the loaded kernel image must be + cleaned to the PoC. In the presence of a system cache or other + coherent masters with caches enabled, this will typically require + cache maintenance by VA rather than set/way operations. + System caches which respect the architected cache maintenance by VA + operations must be configured and may be enabled. + System caches which do not respect architected cache maintenance by VA + operations (not recommended) must be configured and disabled. + +- Architected timers + + CNTFRQ must be programmed with the timer frequency and CNTVOFF must + be programmed with a consistent value on all CPUs. If entering the + kernel at EL1, CNTHCTL_EL2 must have EL1PCTEN (bit 0) set where + available. + +- Coherency + + All CPUs to be booted by the kernel must be part of the same coherency + domain on entry to the kernel. This may require IMPLEMENTATION DEFINED + initialisation to enable the receiving of maintenance operations on + each CPU. + +- System registers + + All writable architected system registers at the exception level where + the kernel image will be entered must be initialised by software at a + higher exception level to prevent execution in an UNKNOWN state. + + - SCR_EL3.FIQ must have the same value across all CPUs the kernel is + executing on. + - The value of SCR_EL3.FIQ must be the same as the one present at boot + time whenever the kernel is executing. + + For systems with a GICv3 interrupt controller to be used in v3 mode: + - If EL3 is present: + + - ICC_SRE_EL3.Enable (bit 3) must be initialiased to 0b1. + - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1. + + - If the kernel is entered at EL1: + + - ICC.SRE_EL2.Enable (bit 3) must be initialised to 0b1 + - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1. + + - The DT or ACPI tables must describe a GICv3 interrupt controller. + + For systems with a GICv3 interrupt controller to be used in + compatibility (v2) mode: + + - If EL3 is present: + + ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0. + + - If the kernel is entered at EL1: + + ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0. + + - The DT or ACPI tables must describe a GICv2 interrupt controller. + + For CPUs with pointer authentication functionality: + - If EL3 is present: + + - SCR_EL3.APK (bit 16) must be initialised to 0b1 + - SCR_EL3.API (bit 17) must be initialised to 0b1 + + - If the kernel is entered at EL1: + + - HCR_EL2.APK (bit 40) must be initialised to 0b1 + - HCR_EL2.API (bit 41) must be initialised to 0b1 + +The requirements described above for CPU mode, caches, MMUs, architected +timers, coherency and system registers apply to all CPUs. All CPUs must +enter the kernel in the same exception level. + +The boot loader is expected to enter the kernel on each CPU in the +following manner: + +- The primary CPU must jump directly to the first instruction of the + kernel image. The device tree blob passed by this CPU must contain + an 'enable-method' property for each cpu node. The supported + enable-methods are described below. + + It is expected that the bootloader will generate these device tree + properties and insert them into the blob prior to kernel entry. + +- CPUs with a "spin-table" enable-method must have a 'cpu-release-addr' + property in their cpu node. This property identifies a + naturally-aligned 64-bit zero-initalised memory location. + + These CPUs should spin outside of the kernel in a reserved area of + memory (communicated to the kernel by a /memreserve/ region in the + device tree) polling their cpu-release-addr location, which must be + contained in the reserved region. A wfe instruction may be inserted + to reduce the overhead of the busy-loop and a sev will be issued by + the primary CPU. When a read of the location pointed to by the + cpu-release-addr returns a non-zero value, the CPU must jump to this + value. The value will be written as a single 64-bit little-endian + value, so CPUs must convert the read value to their native endianness + before jumping to it. + +- CPUs with a "psci" enable method should remain outside of + the kernel (i.e. outside of the regions of memory described to the + kernel in the memory node, or in a reserved area of memory described + to the kernel by a /memreserve/ region in the device tree). The + kernel will issue CPU_ON calls as described in ARM document number ARM + DEN 0022A ("Power State Coordination Interface System Software on ARM + processors") to bring CPUs into the kernel. + + The device tree should contain a 'psci' node, as described in + Documentation/devicetree/bindings/arm/psci.txt. + +- Secondary CPU general-purpose register settings + x0 = 0 (reserved for future use) + x1 = 0 (reserved for future use) + x2 = 0 (reserved for future use) + x3 = 0 (reserved for future use) diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt deleted file mode 100644 index fbab7e21d116..000000000000 --- a/Documentation/arm64/booting.txt +++ /dev/null @@ -1,266 +0,0 @@ - Booting AArch64 Linux - ===================== - -Author: Will Deacon -Date : 07 September 2012 - -This document is based on the ARM booting document by Russell King and -is relevant to all public releases of the AArch64 Linux kernel. - -The AArch64 exception model is made up of a number of exception levels -(EL0 - EL3), with EL0 and EL1 having a secure and a non-secure -counterpart. EL2 is the hypervisor level and exists only in non-secure -mode. EL3 is the highest priority level and exists only in secure mode. - -For the purposes of this document, we will use the term `boot loader' -simply to define all software that executes on the CPU(s) before control -is passed to the Linux kernel. This may include secure monitor and -hypervisor code, or it may just be a handful of instructions for -preparing a minimal boot environment. - -Essentially, the boot loader should provide (as a minimum) the -following: - -1. Setup and initialise the RAM -2. Setup the device tree -3. Decompress the kernel image -4. Call the kernel image - - -1. Setup and initialise RAM ---------------------------- - -Requirement: MANDATORY - -The boot loader is expected to find and initialise all RAM that the -kernel will use for volatile data storage in the system. It performs -this in a machine dependent manner. (It may use internal algorithms -to automatically locate and size all RAM, or it may use knowledge of -the RAM in the machine, or any other method the boot loader designer -sees fit.) - - -2. Setup the device tree -------------------------- - -Requirement: MANDATORY - -The device tree blob (dtb) must be placed on an 8-byte boundary and must -not exceed 2 megabytes in size. Since the dtb will be mapped cacheable -using blocks of up to 2 megabytes in size, it must not be placed within -any 2M region which must be mapped with any specific attributes. - -NOTE: versions prior to v4.2 also require that the DTB be placed within -the 512 MB region starting at text_offset bytes below the kernel Image. - -3. Decompress the kernel image ------------------------------- - -Requirement: OPTIONAL - -The AArch64 kernel does not currently provide a decompressor and -therefore requires decompression (gzip etc.) to be performed by the boot -loader if a compressed Image target (e.g. Image.gz) is used. For -bootloaders that do not implement this requirement, the uncompressed -Image target is available instead. - - -4. Call the kernel image ------------------------- - -Requirement: MANDATORY - -The decompressed kernel image contains a 64-byte header as follows: - - u32 code0; /* Executable code */ - u32 code1; /* Executable code */ - u64 text_offset; /* Image load offset, little endian */ - u64 image_size; /* Effective Image size, little endian */ - u64 flags; /* kernel flags, little endian */ - u64 res2 = 0; /* reserved */ - u64 res3 = 0; /* reserved */ - u64 res4 = 0; /* reserved */ - u32 magic = 0x644d5241; /* Magic number, little endian, "ARM\x64" */ - u32 res5; /* reserved (used for PE COFF offset) */ - - -Header notes: - -- As of v3.17, all fields are little endian unless stated otherwise. - -- code0/code1 are responsible for branching to stext. - -- when booting through EFI, code0/code1 are initially skipped. - res5 is an offset to the PE header and the PE header has the EFI - entry point (efi_stub_entry). When the stub has done its work, it - jumps to code0 to resume the normal boot process. - -- Prior to v3.17, the endianness of text_offset was not specified. In - these cases image_size is zero and text_offset is 0x80000 in the - endianness of the kernel. Where image_size is non-zero image_size is - little-endian and must be respected. Where image_size is zero, - text_offset can be assumed to be 0x80000. - -- The flags field (introduced in v3.17) is a little-endian 64-bit field - composed as follows: - Bit 0: Kernel endianness. 1 if BE, 0 if LE. - Bit 1-2: Kernel Page size. - 0 - Unspecified. - 1 - 4K - 2 - 16K - 3 - 64K - Bit 3: Kernel physical placement - 0 - 2MB aligned base should be as close as possible - to the base of DRAM, since memory below it is not - accessible via the linear mapping - 1 - 2MB aligned base may be anywhere in physical - memory - Bits 4-63: Reserved. - -- When image_size is zero, a bootloader should attempt to keep as much - memory as possible free for use by the kernel immediately after the - end of the kernel image. The amount of space required will vary - depending on selected features, and is effectively unbound. - -The Image must be placed text_offset bytes from a 2MB aligned base -address anywhere in usable system RAM and called there. The region -between the 2 MB aligned base address and the start of the image has no -special significance to the kernel, and may be used for other purposes. -At least image_size bytes from the start of the image must be free for -use by the kernel. -NOTE: versions prior to v4.6 cannot make use of memory below the -physical offset of the Image so it is recommended that the Image be -placed as close as possible to the start of system RAM. - -If an initrd/initramfs is passed to the kernel at boot, it must reside -entirely within a 1 GB aligned physical memory window of up to 32 GB in -size that fully covers the kernel Image as well. - -Any memory described to the kernel (even that below the start of the -image) which is not marked as reserved from the kernel (e.g., with a -memreserve region in the device tree) will be considered as available to -the kernel. - -Before jumping into the kernel, the following conditions must be met: - -- Quiesce all DMA capable devices so that memory does not get - corrupted by bogus network packets or disk data. This will save - you many hours of debug. - -- Primary CPU general-purpose register settings - x0 = physical address of device tree blob (dtb) in system RAM. - x1 = 0 (reserved for future use) - x2 = 0 (reserved for future use) - x3 = 0 (reserved for future use) - -- CPU mode - All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError, - IRQ and FIQ). - The CPU must be in either EL2 (RECOMMENDED in order to have access to - the virtualisation extensions) or non-secure EL1. - -- Caches, MMUs - The MMU must be off. - Instruction cache may be on or off. - The address range corresponding to the loaded kernel image must be - cleaned to the PoC. In the presence of a system cache or other - coherent masters with caches enabled, this will typically require - cache maintenance by VA rather than set/way operations. - System caches which respect the architected cache maintenance by VA - operations must be configured and may be enabled. - System caches which do not respect architected cache maintenance by VA - operations (not recommended) must be configured and disabled. - -- Architected timers - CNTFRQ must be programmed with the timer frequency and CNTVOFF must - be programmed with a consistent value on all CPUs. If entering the - kernel at EL1, CNTHCTL_EL2 must have EL1PCTEN (bit 0) set where - available. - -- Coherency - All CPUs to be booted by the kernel must be part of the same coherency - domain on entry to the kernel. This may require IMPLEMENTATION DEFINED - initialisation to enable the receiving of maintenance operations on - each CPU. - -- System registers - All writable architected system registers at the exception level where - the kernel image will be entered must be initialised by software at a - higher exception level to prevent execution in an UNKNOWN state. - - - SCR_EL3.FIQ must have the same value across all CPUs the kernel is - executing on. - - The value of SCR_EL3.FIQ must be the same as the one present at boot - time whenever the kernel is executing. - - For systems with a GICv3 interrupt controller to be used in v3 mode: - - If EL3 is present: - ICC_SRE_EL3.Enable (bit 3) must be initialiased to 0b1. - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1. - - If the kernel is entered at EL1: - ICC.SRE_EL2.Enable (bit 3) must be initialised to 0b1 - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1. - - The DT or ACPI tables must describe a GICv3 interrupt controller. - - For systems with a GICv3 interrupt controller to be used in - compatibility (v2) mode: - - If EL3 is present: - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0. - - If the kernel is entered at EL1: - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0. - - The DT or ACPI tables must describe a GICv2 interrupt controller. - - For CPUs with pointer authentication functionality: - - If EL3 is present: - SCR_EL3.APK (bit 16) must be initialised to 0b1 - SCR_EL3.API (bit 17) must be initialised to 0b1 - - If the kernel is entered at EL1: - HCR_EL2.APK (bit 40) must be initialised to 0b1 - HCR_EL2.API (bit 41) must be initialised to 0b1 - -The requirements described above for CPU mode, caches, MMUs, architected -timers, coherency and system registers apply to all CPUs. All CPUs must -enter the kernel in the same exception level. - -The boot loader is expected to enter the kernel on each CPU in the -following manner: - -- The primary CPU must jump directly to the first instruction of the - kernel image. The device tree blob passed by this CPU must contain - an 'enable-method' property for each cpu node. The supported - enable-methods are described below. - - It is expected that the bootloader will generate these device tree - properties and insert them into the blob prior to kernel entry. - -- CPUs with a "spin-table" enable-method must have a 'cpu-release-addr' - property in their cpu node. This property identifies a - naturally-aligned 64-bit zero-initalised memory location. - - These CPUs should spin outside of the kernel in a reserved area of - memory (communicated to the kernel by a /memreserve/ region in the - device tree) polling their cpu-release-addr location, which must be - contained in the reserved region. A wfe instruction may be inserted - to reduce the overhead of the busy-loop and a sev will be issued by - the primary CPU. When a read of the location pointed to by the - cpu-release-addr returns a non-zero value, the CPU must jump to this - value. The value will be written as a single 64-bit little-endian - value, so CPUs must convert the read value to their native endianness - before jumping to it. - -- CPUs with a "psci" enable method should remain outside of - the kernel (i.e. outside of the regions of memory described to the - kernel in the memory node, or in a reserved area of memory described - to the kernel by a /memreserve/ region in the device tree). The - kernel will issue CPU_ON calls as described in ARM document number ARM - DEN 0022A ("Power State Coordination Interface System Software on ARM - processors") to bring CPUs into the kernel. - - The device tree should contain a 'psci' node, as described in - Documentation/devicetree/bindings/arm/psci.txt. - -- Secondary CPU general-purpose register settings - x0 = 0 (reserved for future use) - x1 = 0 (reserved for future use) - x2 = 0 (reserved for future use) - x3 = 0 (reserved for future use) diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst new file mode 100644 index 000000000000..2955287e9acc --- /dev/null +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -0,0 +1,304 @@ +=========================== +ARM64 CPU Feature Registers +=========================== + +Author: Suzuki K Poulose + + +This file describes the ABI for exporting the AArch64 CPU ID/feature +registers to userspace. The availability of this ABI is advertised +via the HWCAP_CPUID in HWCAPs. + +1. Motivation +------------- + +The ARM architecture defines a set of feature registers, which describe +the capabilities of the CPU/system. Access to these system registers is +restricted from EL0 and there is no reliable way for an application to +extract this information to make better decisions at runtime. There is +limited information available to the application via HWCAPs, however +there are some issues with their usage. + + a) Any change to the HWCAPs requires an update to userspace (e.g libc) + to detect the new changes, which can take a long time to appear in + distributions. Exposing the registers allows applications to get the + information without requiring updates to the toolchains. + + b) Access to HWCAPs is sometimes limited (e.g prior to libc, or + when ld is initialised at startup time). + + c) HWCAPs cannot represent non-boolean information effectively. The + architecture defines a canonical format for representing features + in the ID registers; this is well defined and is capable of + representing all valid architecture variations. + + +2. Requirements +--------------- + + a) Safety: + + Applications should be able to use the information provided by the + infrastructure to run safely across the system. This has greater + implications on a system with heterogeneous CPUs. + The infrastructure exports a value that is safe across all the + available CPU on the system. + + e.g, If at least one CPU doesn't implement CRC32 instructions, while + others do, we should report that the CRC32 is not implemented. + Otherwise an application could crash when scheduled on the CPU + which doesn't support CRC32. + + b) Security: + + Applications should only be able to receive information that is + relevant to the normal operation in userspace. Hence, some of the + fields are masked out(i.e, made invisible) and their values are set to + indicate the feature is 'not supported'. See Section 4 for the list + of visible features. Also, the kernel may manipulate the fields + based on what it supports. e.g, If FP is not supported by the + kernel, the values could indicate that the FP is not available + (even when the CPU provides it). + + c) Implementation Defined Features + + The infrastructure doesn't expose any register which is + IMPLEMENTATION DEFINED as per ARMv8-A Architecture. + + d) CPU Identification: + + MIDR_EL1 is exposed to help identify the processor. On a + heterogeneous system, this could be racy (just like getcpu()). The + process could be migrated to another CPU by the time it uses the + register value, unless the CPU affinity is set. Hence, there is no + guarantee that the value reflects the processor that it is + currently executing on. The REVIDR is not exposed due to this + constraint, as REVIDR makes sense only in conjunction with the + MIDR. Alternately, MIDR_EL1 and REVIDR_EL1 are exposed via sysfs + at:: + + /sys/devices/system/cpu/cpu$ID/regs/identification/ + \- midr + \- revidr + +3. Implementation +-------------------- + +The infrastructure is built on the emulation of the 'MRS' instruction. +Accessing a restricted system register from an application generates an +exception and ends up in SIGILL being delivered to the process. +The infrastructure hooks into the exception handler and emulates the +operation if the source belongs to the supported system register space. + +The infrastructure emulates only the following system register space:: + + Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7 + +(See Table C5-6 'System instruction encodings for non-Debug System +register accesses' in ARMv8 ARM DDI 0487A.h, for the list of +registers). + +The following rules are applied to the value returned by the +infrastructure: + + a) The value of an 'IMPLEMENTATION DEFINED' field is set to 0. + b) The value of a reserved field is populated with the reserved + value as defined by the architecture. + c) The value of a 'visible' field holds the system wide safe value + for the particular feature (except for MIDR_EL1, see section 4). + d) All other fields (i.e, invisible fields) are set to indicate + the feature is missing (as defined by the architecture). + +4. List of registers with visible features +------------------------------------------- + + 1) ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | TS | [55-52] | y | + +------------------------------+---------+---------+ + | FHM | [51-48] | y | + +------------------------------+---------+---------+ + | DP | [47-44] | y | + +------------------------------+---------+---------+ + | SM4 | [43-40] | y | + +------------------------------+---------+---------+ + | SM3 | [39-36] | y | + +------------------------------+---------+---------+ + | SHA3 | [35-32] | y | + +------------------------------+---------+---------+ + | RDM | [31-28] | y | + +------------------------------+---------+---------+ + | ATOMICS | [23-20] | y | + +------------------------------+---------+---------+ + | CRC32 | [19-16] | y | + +------------------------------+---------+---------+ + | SHA2 | [15-12] | y | + +------------------------------+---------+---------+ + | SHA1 | [11-8] | y | + +------------------------------+---------+---------+ + | AES | [7-4] | y | + +------------------------------+---------+---------+ + + + 2) ID_AA64PFR0_EL1 - Processor Feature Register 0 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | DIT | [51-48] | y | + +------------------------------+---------+---------+ + | SVE | [35-32] | y | + +------------------------------+---------+---------+ + | GIC | [27-24] | n | + +------------------------------+---------+---------+ + | AdvSIMD | [23-20] | y | + +------------------------------+---------+---------+ + | FP | [19-16] | y | + +------------------------------+---------+---------+ + | EL3 | [15-12] | n | + +------------------------------+---------+---------+ + | EL2 | [11-8] | n | + +------------------------------+---------+---------+ + | EL1 | [7-4] | n | + +------------------------------+---------+---------+ + | EL0 | [3-0] | n | + +------------------------------+---------+---------+ + + + 3) MIDR_EL1 - Main ID Register + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | Implementer | [31-24] | y | + +------------------------------+---------+---------+ + | Variant | [23-20] | y | + +------------------------------+---------+---------+ + | Architecture | [19-16] | y | + +------------------------------+---------+---------+ + | PartNum | [15-4] | y | + +------------------------------+---------+---------+ + | Revision | [3-0] | y | + +------------------------------+---------+---------+ + + NOTE: The 'visible' fields of MIDR_EL1 will contain the value + as available on the CPU where it is fetched and is not a system + wide safe value. + + 4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | GPI | [31-28] | y | + +------------------------------+---------+---------+ + | GPA | [27-24] | y | + +------------------------------+---------+---------+ + | LRCPC | [23-20] | y | + +------------------------------+---------+---------+ + | FCMA | [19-16] | y | + +------------------------------+---------+---------+ + | JSCVT | [15-12] | y | + +------------------------------+---------+---------+ + | API | [11-8] | y | + +------------------------------+---------+---------+ + | APA | [7-4] | y | + +------------------------------+---------+---------+ + | DPB | [3-0] | y | + +------------------------------+---------+---------+ + + 5) ID_AA64MMFR2_EL1 - Memory model feature register 2 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | AT | [35-32] | y | + +------------------------------+---------+---------+ + + 6) ID_AA64ZFR0_EL1 - SVE feature ID register 0 + + +------------------------------+---------+---------+ + | Name | bits | visible | + +------------------------------+---------+---------+ + | SM4 | [43-40] | y | + +------------------------------+---------+---------+ + | SHA3 | [35-32] | y | + +------------------------------+---------+---------+ + | BitPerm | [19-16] | y | + +------------------------------+---------+---------+ + | AES | [7-4] | y | + +------------------------------+---------+---------+ + | SVEVer | [3-0] | y | + +------------------------------+---------+---------+ + +Appendix I: Example +------------------- + +:: + + /* + * Sample program to demonstrate the MRS emulation ABI. + * + * Copyright (C) 2015-2016, ARM Ltd + * + * Author: Suzuki K Poulose + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + + #include + #include + #include + + #define get_cpu_ftr(id) ({ \ + unsigned long __val; \ + asm("mrs %0, "#id : "=r" (__val)); \ + printf("%-20s: 0x%016lx\n", #id, __val); \ + }) + + int main(void) + { + + if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { + fputs("CPUID registers unavailable\n", stderr); + return 1; + } + + get_cpu_ftr(ID_AA64ISAR0_EL1); + get_cpu_ftr(ID_AA64ISAR1_EL1); + get_cpu_ftr(ID_AA64MMFR0_EL1); + get_cpu_ftr(ID_AA64MMFR1_EL1); + get_cpu_ftr(ID_AA64PFR0_EL1); + get_cpu_ftr(ID_AA64PFR1_EL1); + get_cpu_ftr(ID_AA64DFR0_EL1); + get_cpu_ftr(ID_AA64DFR1_EL1); + + get_cpu_ftr(MIDR_EL1); + get_cpu_ftr(MPIDR_EL1); + get_cpu_ftr(REVIDR_EL1); + + #if 0 + /* Unexposed register access causes SIGILL */ + get_cpu_ftr(ID_MMFR0_EL1); + #endif + + return 0; + } diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt deleted file mode 100644 index 684a0da39378..000000000000 --- a/Documentation/arm64/cpu-feature-registers.txt +++ /dev/null @@ -1,296 +0,0 @@ - ARM64 CPU Feature Registers - =========================== - -Author: Suzuki K Poulose - - -This file describes the ABI for exporting the AArch64 CPU ID/feature -registers to userspace. The availability of this ABI is advertised -via the HWCAP_CPUID in HWCAPs. - -1. Motivation ---------------- - -The ARM architecture defines a set of feature registers, which describe -the capabilities of the CPU/system. Access to these system registers is -restricted from EL0 and there is no reliable way for an application to -extract this information to make better decisions at runtime. There is -limited information available to the application via HWCAPs, however -there are some issues with their usage. - - a) Any change to the HWCAPs requires an update to userspace (e.g libc) - to detect the new changes, which can take a long time to appear in - distributions. Exposing the registers allows applications to get the - information without requiring updates to the toolchains. - - b) Access to HWCAPs is sometimes limited (e.g prior to libc, or - when ld is initialised at startup time). - - c) HWCAPs cannot represent non-boolean information effectively. The - architecture defines a canonical format for representing features - in the ID registers; this is well defined and is capable of - representing all valid architecture variations. - - -2. Requirements ------------------ - - a) Safety : - Applications should be able to use the information provided by the - infrastructure to run safely across the system. This has greater - implications on a system with heterogeneous CPUs. - The infrastructure exports a value that is safe across all the - available CPU on the system. - - e.g, If at least one CPU doesn't implement CRC32 instructions, while - others do, we should report that the CRC32 is not implemented. - Otherwise an application could crash when scheduled on the CPU - which doesn't support CRC32. - - b) Security : - Applications should only be able to receive information that is - relevant to the normal operation in userspace. Hence, some of the - fields are masked out(i.e, made invisible) and their values are set to - indicate the feature is 'not supported'. See Section 4 for the list - of visible features. Also, the kernel may manipulate the fields - based on what it supports. e.g, If FP is not supported by the - kernel, the values could indicate that the FP is not available - (even when the CPU provides it). - - c) Implementation Defined Features - The infrastructure doesn't expose any register which is - IMPLEMENTATION DEFINED as per ARMv8-A Architecture. - - d) CPU Identification : - MIDR_EL1 is exposed to help identify the processor. On a - heterogeneous system, this could be racy (just like getcpu()). The - process could be migrated to another CPU by the time it uses the - register value, unless the CPU affinity is set. Hence, there is no - guarantee that the value reflects the processor that it is - currently executing on. The REVIDR is not exposed due to this - constraint, as REVIDR makes sense only in conjunction with the - MIDR. Alternately, MIDR_EL1 and REVIDR_EL1 are exposed via sysfs - at: - - /sys/devices/system/cpu/cpu$ID/regs/identification/ - \- midr - \- revidr - -3. Implementation --------------------- - -The infrastructure is built on the emulation of the 'MRS' instruction. -Accessing a restricted system register from an application generates an -exception and ends up in SIGILL being delivered to the process. -The infrastructure hooks into the exception handler and emulates the -operation if the source belongs to the supported system register space. - -The infrastructure emulates only the following system register space: - Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7 - -(See Table C5-6 'System instruction encodings for non-Debug System -register accesses' in ARMv8 ARM DDI 0487A.h, for the list of -registers). - -The following rules are applied to the value returned by the -infrastructure: - - a) The value of an 'IMPLEMENTATION DEFINED' field is set to 0. - b) The value of a reserved field is populated with the reserved - value as defined by the architecture. - c) The value of a 'visible' field holds the system wide safe value - for the particular feature (except for MIDR_EL1, see section 4). - d) All other fields (i.e, invisible fields) are set to indicate - the feature is missing (as defined by the architecture). - -4. List of registers with visible features -------------------------------------------- - - 1) ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | TS | [55-52] | y | - |--------------------------------------------------| - | FHM | [51-48] | y | - |--------------------------------------------------| - | DP | [47-44] | y | - |--------------------------------------------------| - | SM4 | [43-40] | y | - |--------------------------------------------------| - | SM3 | [39-36] | y | - |--------------------------------------------------| - | SHA3 | [35-32] | y | - |--------------------------------------------------| - | RDM | [31-28] | y | - |--------------------------------------------------| - | ATOMICS | [23-20] | y | - |--------------------------------------------------| - | CRC32 | [19-16] | y | - |--------------------------------------------------| - | SHA2 | [15-12] | y | - |--------------------------------------------------| - | SHA1 | [11-8] | y | - |--------------------------------------------------| - | AES | [7-4] | y | - x--------------------------------------------------x - - - 2) ID_AA64PFR0_EL1 - Processor Feature Register 0 - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | DIT | [51-48] | y | - |--------------------------------------------------| - | SVE | [35-32] | y | - |--------------------------------------------------| - | GIC | [27-24] | n | - |--------------------------------------------------| - | AdvSIMD | [23-20] | y | - |--------------------------------------------------| - | FP | [19-16] | y | - |--------------------------------------------------| - | EL3 | [15-12] | n | - |--------------------------------------------------| - | EL2 | [11-8] | n | - |--------------------------------------------------| - | EL1 | [7-4] | n | - |--------------------------------------------------| - | EL0 | [3-0] | n | - x--------------------------------------------------x - - - 3) MIDR_EL1 - Main ID Register - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | Implementer | [31-24] | y | - |--------------------------------------------------| - | Variant | [23-20] | y | - |--------------------------------------------------| - | Architecture | [19-16] | y | - |--------------------------------------------------| - | PartNum | [15-4] | y | - |--------------------------------------------------| - | Revision | [3-0] | y | - x--------------------------------------------------x - - NOTE: The 'visible' fields of MIDR_EL1 will contain the value - as available on the CPU where it is fetched and is not a system - wide safe value. - - 4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1 - - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | GPI | [31-28] | y | - |--------------------------------------------------| - | GPA | [27-24] | y | - |--------------------------------------------------| - | LRCPC | [23-20] | y | - |--------------------------------------------------| - | FCMA | [19-16] | y | - |--------------------------------------------------| - | JSCVT | [15-12] | y | - |--------------------------------------------------| - | API | [11-8] | y | - |--------------------------------------------------| - | APA | [7-4] | y | - |--------------------------------------------------| - | DPB | [3-0] | y | - x--------------------------------------------------x - - 5) ID_AA64MMFR2_EL1 - Memory model feature register 2 - - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | AT | [35-32] | y | - x--------------------------------------------------x - - 6) ID_AA64ZFR0_EL1 - SVE feature ID register 0 - - x--------------------------------------------------x - | Name | bits | visible | - |--------------------------------------------------| - | SM4 | [43-40] | y | - |--------------------------------------------------| - | SHA3 | [35-32] | y | - |--------------------------------------------------| - | BitPerm | [19-16] | y | - |--------------------------------------------------| - | AES | [7-4] | y | - |--------------------------------------------------| - | SVEVer | [3-0] | y | - x--------------------------------------------------x - -Appendix I: Example ---------------------------- - -/* - * Sample program to demonstrate the MRS emulation ABI. - * - * Copyright (C) 2015-2016, ARM Ltd - * - * Author: Suzuki K Poulose - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include - -#define get_cpu_ftr(id) ({ \ - unsigned long __val; \ - asm("mrs %0, "#id : "=r" (__val)); \ - printf("%-20s: 0x%016lx\n", #id, __val); \ - }) - -int main(void) -{ - - if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { - fputs("CPUID registers unavailable\n", stderr); - return 1; - } - - get_cpu_ftr(ID_AA64ISAR0_EL1); - get_cpu_ftr(ID_AA64ISAR1_EL1); - get_cpu_ftr(ID_AA64MMFR0_EL1); - get_cpu_ftr(ID_AA64MMFR1_EL1); - get_cpu_ftr(ID_AA64PFR0_EL1); - get_cpu_ftr(ID_AA64PFR1_EL1); - get_cpu_ftr(ID_AA64DFR0_EL1); - get_cpu_ftr(ID_AA64DFR1_EL1); - - get_cpu_ftr(MIDR_EL1); - get_cpu_ftr(MPIDR_EL1); - get_cpu_ftr(REVIDR_EL1); - -#if 0 - /* Unexposed register access causes SIGILL */ - get_cpu_ftr(ID_MMFR0_EL1); -#endif - - return 0; -} - - - diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst new file mode 100644 index 000000000000..c7cbf4b571c0 --- /dev/null +++ b/Documentation/arm64/elf_hwcaps.rst @@ -0,0 +1,201 @@ +================ +ARM64 ELF hwcaps +================ + +This document describes the usage and semantics of the arm64 ELF hwcaps. + + +1. Introduction +--------------- + +Some hardware or software features are only available on some CPU +implementations, and/or with certain kernel configurations, but have no +architected discovery mechanism available to userspace code at EL0. The +kernel exposes the presence of these features to userspace through a set +of flags called hwcaps, exposed in the auxilliary vector. + +Userspace software can test for features by acquiring the AT_HWCAP or +AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant +flags are set, e.g.:: + + bool floating_point_is_present(void) + { + unsigned long hwcaps = getauxval(AT_HWCAP); + if (hwcaps & HWCAP_FP) + return true; + + return false; + } + +Where software relies on a feature described by a hwcap, it should check +the relevant hwcap flag to verify that the feature is present before +attempting to make use of the feature. + +Features cannot be probed reliably through other means. When a feature +is not available, attempting to use it may result in unpredictable +behaviour, and is not guaranteed to result in any reliable indication +that the feature is unavailable, such as a SIGILL. + + +2. Interpretation of hwcaps +--------------------------- + +The majority of hwcaps are intended to indicate the presence of features +which are described by architected ID registers inaccessible to +userspace code at EL0. These hwcaps are defined in terms of ID register +fields, and should be interpreted with reference to the definition of +these fields in the ARM Architecture Reference Manual (ARM ARM). + +Such hwcaps are described below in the form:: + + Functionality implied by idreg.field == val. + +Such hwcaps indicate the availability of functionality that the ARM ARM +defines as being present when idreg.field has value val, but do not +indicate that idreg.field is precisely equal to val, nor do they +indicate the absence of functionality implied by other values of +idreg.field. + +Other hwcaps may indicate the presence of features which cannot be +described by ID registers alone. These may be described without +reference to ID registers, and may refer to other documentation. + + +3. The hwcaps exposed in AT_HWCAP +--------------------------------- + +HWCAP_FP + Functionality implied by ID_AA64PFR0_EL1.FP == 0b0000. + +HWCAP_ASIMD + Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0000. + +HWCAP_EVTSTRM + The generic timer is configured to generate events at a frequency of + approximately 100KHz. + +HWCAP_AES + Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001. + +HWCAP_PMULL + Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010. + +HWCAP_SHA1 + Functionality implied by ID_AA64ISAR0_EL1.SHA1 == 0b0001. + +HWCAP_SHA2 + Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0001. + +HWCAP_CRC32 + Functionality implied by ID_AA64ISAR0_EL1.CRC32 == 0b0001. + +HWCAP_ATOMICS + Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0010. + +HWCAP_FPHP + Functionality implied by ID_AA64PFR0_EL1.FP == 0b0001. + +HWCAP_ASIMDHP + Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0001. + +HWCAP_CPUID + EL0 access to certain ID registers is available, to the extent + described by Documentation/arm64/cpu-feature-registers.rst. + + These ID registers may imply the availability of features. + +HWCAP_ASIMDRDM + Functionality implied by ID_AA64ISAR0_EL1.RDM == 0b0001. + +HWCAP_JSCVT + Functionality implied by ID_AA64ISAR1_EL1.JSCVT == 0b0001. + +HWCAP_FCMA + Functionality implied by ID_AA64ISAR1_EL1.FCMA == 0b0001. + +HWCAP_LRCPC + Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0001. + +HWCAP_DCPOP + Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001. + +HWCAP2_DCPODP + + Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. + +HWCAP_SHA3 + Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001. + +HWCAP_SM3 + Functionality implied by ID_AA64ISAR0_EL1.SM3 == 0b0001. + +HWCAP_SM4 + Functionality implied by ID_AA64ISAR0_EL1.SM4 == 0b0001. + +HWCAP_ASIMDDP + Functionality implied by ID_AA64ISAR0_EL1.DP == 0b0001. + +HWCAP_SHA512 + Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010. + +HWCAP_SVE + Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001. + +HWCAP2_SVE2 + + Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001. + +HWCAP2_SVEAES + + Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001. + +HWCAP2_SVEPMULL + + Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010. + +HWCAP2_SVEBITPERM + + Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001. + +HWCAP2_SVESHA3 + + Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001. + +HWCAP2_SVESM4 + + Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001. + +HWCAP_ASIMDFHM + Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001. + +HWCAP_DIT + Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001. + +HWCAP_USCAT + Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001. + +HWCAP_ILRCPC + Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010. + +HWCAP_FLAGM + Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001. + +HWCAP_SSBS + Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010. + +HWCAP_PACA + Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or + ID_AA64ISAR1_EL1.API == 0b0001, as described by + Documentation/arm64/pointer-authentication.rst. + +HWCAP_PACG + Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or + ID_AA64ISAR1_EL1.GPI == 0b0001, as described by + Documentation/arm64/pointer-authentication.rst. + + +4. Unused AT_HWCAP bits +----------------------- + +For interoperation with userspace, the kernel guarantees that bits 62 +and 63 of AT_HWCAP will always be returned as 0. diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.txt deleted file mode 100644 index b73a2519ecf2..000000000000 --- a/Documentation/arm64/elf_hwcaps.txt +++ /dev/null @@ -1,231 +0,0 @@ -ARM64 ELF hwcaps -================ - -This document describes the usage and semantics of the arm64 ELF hwcaps. - - -1. Introduction ---------------- - -Some hardware or software features are only available on some CPU -implementations, and/or with certain kernel configurations, but have no -architected discovery mechanism available to userspace code at EL0. The -kernel exposes the presence of these features to userspace through a set -of flags called hwcaps, exposed in the auxilliary vector. - -Userspace software can test for features by acquiring the AT_HWCAP or -AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant -flags are set, e.g. - -bool floating_point_is_present(void) -{ - unsigned long hwcaps = getauxval(AT_HWCAP); - if (hwcaps & HWCAP_FP) - return true; - - return false; -} - -Where software relies on a feature described by a hwcap, it should check -the relevant hwcap flag to verify that the feature is present before -attempting to make use of the feature. - -Features cannot be probed reliably through other means. When a feature -is not available, attempting to use it may result in unpredictable -behaviour, and is not guaranteed to result in any reliable indication -that the feature is unavailable, such as a SIGILL. - - -2. Interpretation of hwcaps ---------------------------- - -The majority of hwcaps are intended to indicate the presence of features -which are described by architected ID registers inaccessible to -userspace code at EL0. These hwcaps are defined in terms of ID register -fields, and should be interpreted with reference to the definition of -these fields in the ARM Architecture Reference Manual (ARM ARM). - -Such hwcaps are described below in the form: - - Functionality implied by idreg.field == val. - -Such hwcaps indicate the availability of functionality that the ARM ARM -defines as being present when idreg.field has value val, but do not -indicate that idreg.field is precisely equal to val, nor do they -indicate the absence of functionality implied by other values of -idreg.field. - -Other hwcaps may indicate the presence of features which cannot be -described by ID registers alone. These may be described without -reference to ID registers, and may refer to other documentation. - - -3. The hwcaps exposed in AT_HWCAP ---------------------------------- - -HWCAP_FP - - Functionality implied by ID_AA64PFR0_EL1.FP == 0b0000. - -HWCAP_ASIMD - - Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0000. - -HWCAP_EVTSTRM - - The generic timer is configured to generate events at a frequency of - approximately 100KHz. - -HWCAP_AES - - Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001. - -HWCAP_PMULL - - Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010. - -HWCAP_SHA1 - - Functionality implied by ID_AA64ISAR0_EL1.SHA1 == 0b0001. - -HWCAP_SHA2 - - Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0001. - -HWCAP_CRC32 - - Functionality implied by ID_AA64ISAR0_EL1.CRC32 == 0b0001. - -HWCAP_ATOMICS - - Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0010. - -HWCAP_FPHP - - Functionality implied by ID_AA64PFR0_EL1.FP == 0b0001. - -HWCAP_ASIMDHP - - Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0001. - -HWCAP_CPUID - - EL0 access to certain ID registers is available, to the extent - described by Documentation/arm64/cpu-feature-registers.txt. - - These ID registers may imply the availability of features. - -HWCAP_ASIMDRDM - - Functionality implied by ID_AA64ISAR0_EL1.RDM == 0b0001. - -HWCAP_JSCVT - - Functionality implied by ID_AA64ISAR1_EL1.JSCVT == 0b0001. - -HWCAP_FCMA - - Functionality implied by ID_AA64ISAR1_EL1.FCMA == 0b0001. - -HWCAP_LRCPC - - Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0001. - -HWCAP_DCPOP - - Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001. - -HWCAP2_DCPODP - - Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. - -HWCAP_SHA3 - - Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001. - -HWCAP_SM3 - - Functionality implied by ID_AA64ISAR0_EL1.SM3 == 0b0001. - -HWCAP_SM4 - - Functionality implied by ID_AA64ISAR0_EL1.SM4 == 0b0001. - -HWCAP_ASIMDDP - - Functionality implied by ID_AA64ISAR0_EL1.DP == 0b0001. - -HWCAP_SHA512 - - Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010. - -HWCAP_SVE - - Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001. - -HWCAP2_SVE2 - - Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001. - -HWCAP2_SVEAES - - Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001. - -HWCAP2_SVEPMULL - - Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010. - -HWCAP2_SVEBITPERM - - Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001. - -HWCAP2_SVESHA3 - - Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001. - -HWCAP2_SVESM4 - - Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001. - -HWCAP_ASIMDFHM - - Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001. - -HWCAP_DIT - - Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001. - -HWCAP_USCAT - - Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001. - -HWCAP_ILRCPC - - Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010. - -HWCAP_FLAGM - - Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001. - -HWCAP_SSBS - - Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010. - -HWCAP_PACA - - Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or - ID_AA64ISAR1_EL1.API == 0b0001, as described by - Documentation/arm64/pointer-authentication.txt. - -HWCAP_PACG - - Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or - ID_AA64ISAR1_EL1.GPI == 0b0001, as described by - Documentation/arm64/pointer-authentication.txt. - - -4. Unused AT_HWCAP bits ------------------------ - -For interoperation with userspace, the kernel guarantees that bits 62 -and 63 of AT_HWCAP will always be returned as 0. diff --git a/Documentation/arm64/hugetlbpage.rst b/Documentation/arm64/hugetlbpage.rst new file mode 100644 index 000000000000..b44f939e5210 --- /dev/null +++ b/Documentation/arm64/hugetlbpage.rst @@ -0,0 +1,41 @@ +==================== +HugeTLBpage on ARM64 +==================== + +Hugepage relies on making efficient use of TLBs to improve performance of +address translations. The benefit depends on both - + + - the size of hugepages + - size of entries supported by the TLBs + +The ARM64 port supports two flavours of hugepages. + +1) Block mappings at the pud/pmd level +-------------------------------------- + +These are regular hugepages where a pmd or a pud page table entry points to a +block of memory. Regardless of the supported size of entries in TLB, block +mappings reduce the depth of page table walk needed to translate hugepage +addresses. + +2) Using the Contiguous bit +--------------------------- + +The architecture provides a contiguous bit in the translation table entries +(D4.5.3, ARM DDI 0487C.a) that hints to the MMU to indicate that it is one of a +contiguous set of entries that can be cached in a single TLB entry. + +The contiguous bit is used in Linux to increase the mapping size at the pmd and +pte (last) level. The number of supported contiguous entries varies by page size +and level of the page table. + + +The following hugepage sizes are supported - + + ====== ======== ==== ======== === + - CONT PTE PMD CONT PMD PUD + ====== ======== ==== ======== === + 4K: 64K 2M 32M 1G + 16K: 2M 32M 1G + 64K: 2M 512M 16G + ====== ======== ==== ======== === diff --git a/Documentation/arm64/hugetlbpage.txt b/Documentation/arm64/hugetlbpage.txt deleted file mode 100644 index cfae87dc653b..000000000000 --- a/Documentation/arm64/hugetlbpage.txt +++ /dev/null @@ -1,38 +0,0 @@ -HugeTLBpage on ARM64 -==================== - -Hugepage relies on making efficient use of TLBs to improve performance of -address translations. The benefit depends on both - - - - the size of hugepages - - size of entries supported by the TLBs - -The ARM64 port supports two flavours of hugepages. - -1) Block mappings at the pud/pmd level --------------------------------------- - -These are regular hugepages where a pmd or a pud page table entry points to a -block of memory. Regardless of the supported size of entries in TLB, block -mappings reduce the depth of page table walk needed to translate hugepage -addresses. - -2) Using the Contiguous bit ---------------------------- - -The architecture provides a contiguous bit in the translation table entries -(D4.5.3, ARM DDI 0487C.a) that hints to the MMU to indicate that it is one of a -contiguous set of entries that can be cached in a single TLB entry. - -The contiguous bit is used in Linux to increase the mapping size at the pmd and -pte (last) level. The number of supported contiguous entries varies by page size -and level of the page table. - - -The following hugepage sizes are supported - - - CONT PTE PMD CONT PMD PUD - -------- --- -------- --- - 4K: 64K 2M 32M 1G - 16K: 2M 32M 1G - 64K: 2M 512M 16G diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst new file mode 100644 index 000000000000..018b7836ecb7 --- /dev/null +++ b/Documentation/arm64/index.rst @@ -0,0 +1,28 @@ +:orphan: + +================== +ARM64 Architecture +================== + +.. toctree:: + :maxdepth: 1 + + acpi_object_usage + arm-acpi + booting + cpu-feature-registers + elf_hwcaps + hugetlbpage + legacy_instructions + memory + pointer-authentication + silicon-errata + sve + tagged-pointers + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/arm64/legacy_instructions.rst b/Documentation/arm64/legacy_instructions.rst new file mode 100644 index 000000000000..54401b22cb8f --- /dev/null +++ b/Documentation/arm64/legacy_instructions.rst @@ -0,0 +1,68 @@ +=================== +Legacy instructions +=================== + +The arm64 port of the Linux kernel provides infrastructure to support +emulation of instructions which have been deprecated, or obsoleted in +the architecture. The infrastructure code uses undefined instruction +hooks to support emulation. Where available it also allows turning on +the instruction execution in hardware. + +The emulation mode can be controlled by writing to sysctl nodes +(/proc/sys/abi). The following explains the different execution +behaviours and the corresponding values of the sysctl nodes - + +* Undef + Value: 0 + + Generates undefined instruction abort. Default for instructions that + have been obsoleted in the architecture, e.g., SWP + +* Emulate + Value: 1 + + Uses software emulation. To aid migration of software, in this mode + usage of emulated instruction is traced as well as rate limited + warnings are issued. This is the default for deprecated + instructions, .e.g., CP15 barriers + +* Hardware Execution + Value: 2 + + Although marked as deprecated, some implementations may support the + enabling/disabling of hardware support for the execution of these + instructions. Using hardware execution generally provides better + performance, but at the loss of ability to gather runtime statistics + about the use of the deprecated instructions. + +The default mode depends on the status of the instruction in the +architecture. Deprecated instructions should default to emulation +while obsolete instructions must be undefined by default. + +Note: Instruction emulation may not be possible in all cases. See +individual instruction notes for further information. + +Supported legacy instructions +----------------------------- +* SWP{B} + +:Node: /proc/sys/abi/swp +:Status: Obsolete +:Default: Undef (0) + +* CP15 Barriers + +:Node: /proc/sys/abi/cp15_barrier +:Status: Deprecated +:Default: Emulate (1) + +* SETEND + +:Node: /proc/sys/abi/setend +:Status: Deprecated +:Default: Emulate (1)* + + Note: All the cpus on the system must have mixed endian support at EL0 + for this feature to be enabled. If a new CPU - which doesn't support mixed + endian - is hotplugged in after this feature has been enabled, there could + be unexpected results in the application. diff --git a/Documentation/arm64/legacy_instructions.txt b/Documentation/arm64/legacy_instructions.txt deleted file mode 100644 index 01bf3d9fac85..000000000000 --- a/Documentation/arm64/legacy_instructions.txt +++ /dev/null @@ -1,57 +0,0 @@ -The arm64 port of the Linux kernel provides infrastructure to support -emulation of instructions which have been deprecated, or obsoleted in -the architecture. The infrastructure code uses undefined instruction -hooks to support emulation. Where available it also allows turning on -the instruction execution in hardware. - -The emulation mode can be controlled by writing to sysctl nodes -(/proc/sys/abi). The following explains the different execution -behaviours and the corresponding values of the sysctl nodes - - -* Undef - Value: 0 - Generates undefined instruction abort. Default for instructions that - have been obsoleted in the architecture, e.g., SWP - -* Emulate - Value: 1 - Uses software emulation. To aid migration of software, in this mode - usage of emulated instruction is traced as well as rate limited - warnings are issued. This is the default for deprecated - instructions, .e.g., CP15 barriers - -* Hardware Execution - Value: 2 - Although marked as deprecated, some implementations may support the - enabling/disabling of hardware support for the execution of these - instructions. Using hardware execution generally provides better - performance, but at the loss of ability to gather runtime statistics - about the use of the deprecated instructions. - -The default mode depends on the status of the instruction in the -architecture. Deprecated instructions should default to emulation -while obsolete instructions must be undefined by default. - -Note: Instruction emulation may not be possible in all cases. See -individual instruction notes for further information. - -Supported legacy instructions ------------------------------ -* SWP{B} -Node: /proc/sys/abi/swp -Status: Obsolete -Default: Undef (0) - -* CP15 Barriers -Node: /proc/sys/abi/cp15_barrier -Status: Deprecated -Default: Emulate (1) - -* SETEND -Node: /proc/sys/abi/setend -Status: Deprecated -Default: Emulate (1)* -Note: All the cpus on the system must have mixed endian support at EL0 -for this feature to be enabled. If a new CPU - which doesn't support mixed -endian - is hotplugged in after this feature has been enabled, there could -be unexpected results in the application. diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst new file mode 100644 index 000000000000..464b880fc4b7 --- /dev/null +++ b/Documentation/arm64/memory.rst @@ -0,0 +1,98 @@ +============================== +Memory Layout on AArch64 Linux +============================== + +Author: Catalin Marinas + +This document describes the virtual memory layout used by the AArch64 +Linux kernel. The architecture allows up to 4 levels of translation +tables with a 4KB page size and up to 3 levels with a 64KB page size. + +AArch64 Linux uses either 3 levels or 4 levels of translation tables +with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit +(256TB) virtual addresses, respectively, for both user and kernel. With +64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB) +virtual address, are used but the memory layout is the same. + +User addresses have bits 63:48 set to 0 while the kernel addresses have +the same bits set to 1. TTBRx selection is given by bit 63 of the +virtual address. The swapper_pg_dir contains only kernel (global) +mappings while the user pgd contains only user (non-global) mappings. +The swapper_pg_dir address is written to TTBR1 and never written to +TTBR0. + + +AArch64 Linux memory layout with 4KB pages + 3 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000007fffffffff 512GB user + ffffff8000000000 ffffffffffffffff 512GB kernel + + +AArch64 Linux memory layout with 4KB pages + 4 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000ffffffffffff 256TB user + ffff000000000000 ffffffffffffffff 256TB kernel + + +AArch64 Linux memory layout with 64KB pages + 2 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 000003ffffffffff 4TB user + fffffc0000000000 ffffffffffffffff 4TB kernel + + +AArch64 Linux memory layout with 64KB pages + 3 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000ffffffffffff 256TB user + ffff000000000000 ffffffffffffffff 256TB kernel + + +For details of the virtual kernel memory layout please see the kernel +booting log. + + +Translation table lookup with 4KB pages:: + + +--------+--------+--------+--------+--------+--------+--------+--------+ + |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| + +--------+--------+--------+--------+--------+--------+--------+--------+ + | | | | | | + | | | | | v + | | | | | [11:0] in-page offset + | | | | +-> [20:12] L3 index + | | | +-----------> [29:21] L2 index + | | +---------------------> [38:30] L1 index + | +-------------------------------> [47:39] L0 index + +-------------------------------------------------> [63] TTBR0/1 + + +Translation table lookup with 64KB pages:: + + +--------+--------+--------+--------+--------+--------+--------+--------+ + |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| + +--------+--------+--------+--------+--------+--------+--------+--------+ + | | | | | + | | | | v + | | | | [15:0] in-page offset + | | | +----------> [28:16] L3 index + | | +--------------------------> [41:29] L2 index + | +-------------------------------> [47:42] L1 index + +-------------------------------------------------> [63] TTBR0/1 + + +When using KVM without the Virtualization Host Extensions, the +hypervisor maps kernel pages in EL2 at a fixed (and potentially +random) offset from the linear mapping. See the kern_hyp_va macro and +kvm_update_va_mask function for more details. MMIO devices such as +GICv2 gets mapped next to the HYP idmap page, as do vectors when +ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs. + +When using KVM with the Virtualization Host Extensions, no additional +mappings are created, since the host kernel runs directly in EL2. diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt deleted file mode 100644 index c5dab30d3389..000000000000 --- a/Documentation/arm64/memory.txt +++ /dev/null @@ -1,97 +0,0 @@ - Memory Layout on AArch64 Linux - ============================== - -Author: Catalin Marinas - -This document describes the virtual memory layout used by the AArch64 -Linux kernel. The architecture allows up to 4 levels of translation -tables with a 4KB page size and up to 3 levels with a 64KB page size. - -AArch64 Linux uses either 3 levels or 4 levels of translation tables -with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit -(256TB) virtual addresses, respectively, for both user and kernel. With -64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB) -virtual address, are used but the memory layout is the same. - -User addresses have bits 63:48 set to 0 while the kernel addresses have -the same bits set to 1. TTBRx selection is given by bit 63 of the -virtual address. The swapper_pg_dir contains only kernel (global) -mappings while the user pgd contains only user (non-global) mappings. -The swapper_pg_dir address is written to TTBR1 and never written to -TTBR0. - - -AArch64 Linux memory layout with 4KB pages + 3 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000007fffffffff 512GB user -ffffff8000000000 ffffffffffffffff 512GB kernel - - -AArch64 Linux memory layout with 4KB pages + 4 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000ffffffffffff 256TB user -ffff000000000000 ffffffffffffffff 256TB kernel - - -AArch64 Linux memory layout with 64KB pages + 2 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 000003ffffffffff 4TB user -fffffc0000000000 ffffffffffffffff 4TB kernel - - -AArch64 Linux memory layout with 64KB pages + 3 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000ffffffffffff 256TB user -ffff000000000000 ffffffffffffffff 256TB kernel - - -For details of the virtual kernel memory layout please see the kernel -booting log. - - -Translation table lookup with 4KB pages: - -+--------+--------+--------+--------+--------+--------+--------+--------+ -|63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| -+--------+--------+--------+--------+--------+--------+--------+--------+ - | | | | | | - | | | | | v - | | | | | [11:0] in-page offset - | | | | +-> [20:12] L3 index - | | | +-----------> [29:21] L2 index - | | +---------------------> [38:30] L1 index - | +-------------------------------> [47:39] L0 index - +-------------------------------------------------> [63] TTBR0/1 - - -Translation table lookup with 64KB pages: - -+--------+--------+--------+--------+--------+--------+--------+--------+ -|63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| -+--------+--------+--------+--------+--------+--------+--------+--------+ - | | | | | - | | | | v - | | | | [15:0] in-page offset - | | | +----------> [28:16] L3 index - | | +--------------------------> [41:29] L2 index - | +-------------------------------> [47:42] L1 index - +-------------------------------------------------> [63] TTBR0/1 - - -When using KVM without the Virtualization Host Extensions, the -hypervisor maps kernel pages in EL2 at a fixed (and potentially -random) offset from the linear mapping. See the kern_hyp_va macro and -kvm_update_va_mask function for more details. MMIO devices such as -GICv2 gets mapped next to the HYP idmap page, as do vectors when -ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs. - -When using KVM with the Virtualization Host Extensions, no additional -mappings are created, since the host kernel runs directly in EL2. diff --git a/Documentation/arm64/pointer-authentication.rst b/Documentation/arm64/pointer-authentication.rst new file mode 100644 index 000000000000..30b2ab06526b --- /dev/null +++ b/Documentation/arm64/pointer-authentication.rst @@ -0,0 +1,109 @@ +======================================= +Pointer authentication in AArch64 Linux +======================================= + +Author: Mark Rutland + +Date: 2017-07-19 + +This document briefly describes the provision of pointer authentication +functionality in AArch64 Linux. + + +Architecture overview +--------------------- + +The ARMv8.3 Pointer Authentication extension adds primitives that can be +used to mitigate certain classes of attack where an attacker can corrupt +the contents of some memory (e.g. the stack). + +The extension uses a Pointer Authentication Code (PAC) to determine +whether pointers have been modified unexpectedly. A PAC is derived from +a pointer, another value (such as the stack pointer), and a secret key +held in system registers. + +The extension adds instructions to insert a valid PAC into a pointer, +and to verify/remove the PAC from a pointer. The PAC occupies a number +of high-order bits of the pointer, which varies dependent on the +configured virtual address size and whether pointer tagging is in use. + +A subset of these instructions have been allocated from the HINT +encoding space. In the absence of the extension (or when disabled), +these instructions behave as NOPs. Applications and libraries using +these instructions operate correctly regardless of the presence of the +extension. + +The extension provides five separate keys to generate PACs - two for +instruction addresses (APIAKey, APIBKey), two for data addresses +(APDAKey, APDBKey), and one for generic authentication (APGAKey). + + +Basic support +------------- + +When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is +present, the kernel will assign random key values to each process at +exec*() time. The keys are shared by all threads within the process, and +are preserved across fork(). + +Presence of address authentication functionality is advertised via +HWCAP_PACA, and generic authentication functionality via HWCAP_PACG. + +The number of bits that the PAC occupies in a pointer is 55 minus the +virtual address size configured by the kernel. For example, with a +virtual address size of 48, the PAC is 7 bits wide. + +Recent versions of GCC can compile code with APIAKey-based return +address protection when passed the -msign-return-address option. This +uses instructions in the HINT space (unless -march=armv8.3-a or higher +is also passed), and such code can run on systems without the pointer +authentication extension. + +In addition to exec(), keys can also be reinitialized to random values +using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY, +PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY +specifies which keys are to be reinitialized; specifying 0 means "all +keys". + + +Debugging +--------- + +When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address +authentication is present, the kernel will expose the position of TTBR0 +PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which +userspace can acquire via PTRACE_GETREGSET. + +The regset is exposed only when HWCAP_PACA is set. Separate masks are +exposed for data pointers and instruction pointers, as the set of PAC +bits can vary between the two. Note that the masks apply to TTBR0 +addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel +pointers). + +Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel +will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct +user_pac_address_keys and struct user_pac_generic_keys). These can be +used to get and set the keys for a thread. + + +Virtualization +-------------- + +Pointer authentication is enabled in KVM guest when each virtual cpu is +initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and +requesting these two separate cpu features to be enabled. The current KVM +guest implementation works by enabling both features together, so both +these userspace flags are checked before enabling pointer authentication. +The separate userspace flag will allow to have no userspace ABI changes +if support is added in the future to allow these two features to be +enabled independently of one another. + +As Arm Architecture specifies that Pointer Authentication feature is +implemented along with the VHE feature so KVM arm64 ptrauth code relies +on VHE mode to be present. + +Additionally, when these vcpu feature flags are not set then KVM will +filter out the Pointer Authentication system key registers from +KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID +register. Any attempt to use the Pointer Authentication instructions will +result in an UNDEFINED exception being injected into the guest. diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt deleted file mode 100644 index fc71b33de87e..000000000000 --- a/Documentation/arm64/pointer-authentication.txt +++ /dev/null @@ -1,107 +0,0 @@ -Pointer authentication in AArch64 Linux -======================================= - -Author: Mark Rutland -Date: 2017-07-19 - -This document briefly describes the provision of pointer authentication -functionality in AArch64 Linux. - - -Architecture overview ---------------------- - -The ARMv8.3 Pointer Authentication extension adds primitives that can be -used to mitigate certain classes of attack where an attacker can corrupt -the contents of some memory (e.g. the stack). - -The extension uses a Pointer Authentication Code (PAC) to determine -whether pointers have been modified unexpectedly. A PAC is derived from -a pointer, another value (such as the stack pointer), and a secret key -held in system registers. - -The extension adds instructions to insert a valid PAC into a pointer, -and to verify/remove the PAC from a pointer. The PAC occupies a number -of high-order bits of the pointer, which varies dependent on the -configured virtual address size and whether pointer tagging is in use. - -A subset of these instructions have been allocated from the HINT -encoding space. In the absence of the extension (or when disabled), -these instructions behave as NOPs. Applications and libraries using -these instructions operate correctly regardless of the presence of the -extension. - -The extension provides five separate keys to generate PACs - two for -instruction addresses (APIAKey, APIBKey), two for data addresses -(APDAKey, APDBKey), and one for generic authentication (APGAKey). - - -Basic support -------------- - -When CONFIG_ARM64_PTR_AUTH is selected, and relevant HW support is -present, the kernel will assign random key values to each process at -exec*() time. The keys are shared by all threads within the process, and -are preserved across fork(). - -Presence of address authentication functionality is advertised via -HWCAP_PACA, and generic authentication functionality via HWCAP_PACG. - -The number of bits that the PAC occupies in a pointer is 55 minus the -virtual address size configured by the kernel. For example, with a -virtual address size of 48, the PAC is 7 bits wide. - -Recent versions of GCC can compile code with APIAKey-based return -address protection when passed the -msign-return-address option. This -uses instructions in the HINT space (unless -march=armv8.3-a or higher -is also passed), and such code can run on systems without the pointer -authentication extension. - -In addition to exec(), keys can also be reinitialized to random values -using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY, -PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY and PR_PAC_APGAKEY -specifies which keys are to be reinitialized; specifying 0 means "all -keys". - - -Debugging ---------- - -When CONFIG_ARM64_PTR_AUTH is selected, and HW support for address -authentication is present, the kernel will expose the position of TTBR0 -PAC bits in the NT_ARM_PAC_MASK regset (struct user_pac_mask), which -userspace can acquire via PTRACE_GETREGSET. - -The regset is exposed only when HWCAP_PACA is set. Separate masks are -exposed for data pointers and instruction pointers, as the set of PAC -bits can vary between the two. Note that the masks apply to TTBR0 -addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel -pointers). - -Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel -will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct -user_pac_address_keys and struct user_pac_generic_keys). These can be -used to get and set the keys for a thread. - - -Virtualization --------------- - -Pointer authentication is enabled in KVM guest when each virtual cpu is -initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and -requesting these two separate cpu features to be enabled. The current KVM -guest implementation works by enabling both features together, so both -these userspace flags are checked before enabling pointer authentication. -The separate userspace flag will allow to have no userspace ABI changes -if support is added in the future to allow these two features to be -enabled independently of one another. - -As Arm Architecture specifies that Pointer Authentication feature is -implemented along with the VHE feature so KVM arm64 ptrauth code relies -on VHE mode to be present. - -Additionally, when these vcpu feature flags are not set then KVM will -filter out the Pointer Authentication system key registers from -KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID -register. Any attempt to use the Pointer Authentication instructions will -result in an UNDEFINED exception being injected into the guest. diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst new file mode 100644 index 000000000000..c792774be59e --- /dev/null +++ b/Documentation/arm64/silicon-errata.rst @@ -0,0 +1,131 @@ +======================================= +Silicon Errata and Software Workarounds +======================================= + +Author: Will Deacon + +Date : 27 November 2015 + +It is an unfortunate fact of life that hardware is often produced with +so-called "errata", which can cause it to deviate from the architecture +under specific circumstances. For hardware produced by ARM, these +errata are broadly classified into the following categories: + + ========== ======================================================== + Category A A critical error without a viable workaround. + Category B A significant or critical error with an acceptable + workaround. + Category C A minor error that is not expected to occur under normal + operation. + ========== ======================================================== + +For more information, consult one of the "Software Developers Errata +Notice" documents available on infocenter.arm.com (registration +required). + +As far as Linux is concerned, Category B errata may require some special +treatment in the operating system. For example, avoiding a particular +sequence of code, or configuring the processor in a particular way. A +less common situation may require similar actions in order to declassify +a Category A erratum into a Category C erratum. These are collectively +known as "software workarounds" and are only required in the minority of +cases (e.g. those cases that both require a non-secure workaround *and* +can be triggered by Linux). + +For software workarounds that may adversely impact systems unaffected by +the erratum in question, a Kconfig entry is added under "Kernel +Features" -> "ARM errata workarounds via the alternatives framework". +These are enabled by default and patched in at runtime when an affected +CPU is detected. For less-intrusive workarounds, a Kconfig option is not +available and the code is structured (preferably with a comment) in such +a way that the erratum will not be hit. + +This approach can make it slightly onerous to determine exactly which +errata are worked around in an arbitrary kernel source tree, so this +file acts as a registry of software workarounds in the Linux Kernel and +will be updated when new workarounds are committed and backported to +stable kernels. + ++----------------+-----------------+-----------------+-----------------------------+ +| Implementor | Component | Erratum ID | Kconfig | ++================+=================+=================+=============================+ +| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A57 | #852523 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A72 | #853709 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | ++----------------+-----------------+-----------------+-----------------------------+ +| ARM | MMU-500 | #841119,826419 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX ITS | #22375,24313 | CAVIUM_ERRATUM_22375 | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX SMMUv2 | #27704 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX2 SMMUv3| #74 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX2 SMMUv3| #126 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | ++----------------+-----------------+-----------------+-----------------------------+ +| Hisilicon | Hip0{6,7} | #161010701 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | ++----------------+-----------------+-----------------+-----------------------------+ +| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | ++----------------+-----------------+-----------------+-----------------------------+ +| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | ++----------------+-----------------+-----------------+-----------------------------+ +| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | ++----------------+-----------------+-----------------+-----------------------------+ +| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ +| Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | ++----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt deleted file mode 100644 index 2735462d5958..000000000000 --- a/Documentation/arm64/silicon-errata.txt +++ /dev/null @@ -1,88 +0,0 @@ - Silicon Errata and Software Workarounds - ======================================= - -Author: Will Deacon -Date : 27 November 2015 - -It is an unfortunate fact of life that hardware is often produced with -so-called "errata", which can cause it to deviate from the architecture -under specific circumstances. For hardware produced by ARM, these -errata are broadly classified into the following categories: - - Category A: A critical error without a viable workaround. - Category B: A significant or critical error with an acceptable - workaround. - Category C: A minor error that is not expected to occur under normal - operation. - -For more information, consult one of the "Software Developers Errata -Notice" documents available on infocenter.arm.com (registration -required). - -As far as Linux is concerned, Category B errata may require some special -treatment in the operating system. For example, avoiding a particular -sequence of code, or configuring the processor in a particular way. A -less common situation may require similar actions in order to declassify -a Category A erratum into a Category C erratum. These are collectively -known as "software workarounds" and are only required in the minority of -cases (e.g. those cases that both require a non-secure workaround *and* -can be triggered by Linux). - -For software workarounds that may adversely impact systems unaffected by -the erratum in question, a Kconfig entry is added under "Kernel -Features" -> "ARM errata workarounds via the alternatives framework". -These are enabled by default and patched in at runtime when an affected -CPU is detected. For less-intrusive workarounds, a Kconfig option is not -available and the code is structured (preferably with a comment) in such -a way that the erratum will not be hit. - -This approach can make it slightly onerous to determine exactly which -errata are worked around in an arbitrary kernel source tree, so this -file acts as a registry of software workarounds in the Linux Kernel and -will be updated when new workarounds are committed and backported to -stable kernels. - -| Implementor | Component | Erratum ID | Kconfig | -+----------------+-----------------+-----------------+-----------------------------+ -| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | -| | | | | -| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | -| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | -| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | -| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | -| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | -| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | -| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | -| ARM | Cortex-A57 | #852523 | N/A | -| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | -| ARM | Cortex-A72 | #853709 | N/A | -| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | -| ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | -| ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 | -| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 | -| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 | -| ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | -| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | -| ARM | MMU-500 | #841119,826419 | N/A | -| | | | | -| Cavium | ThunderX ITS | #22375,24313 | CAVIUM_ERRATUM_22375 | -| Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 | -| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | -| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | -| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 | -| Cavium | ThunderX SMMUv2 | #27704 | N/A | -| Cavium | ThunderX2 SMMUv3| #74 | N/A | -| Cavium | ThunderX2 SMMUv3| #126 | N/A | -| | | | | -| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | -| | | | | -| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | -| Hisilicon | Hip0{6,7} | #161010701 | N/A | -| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | -| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A | -| | | | | -| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | -| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | -| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | -| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 | -| Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst new file mode 100644 index 000000000000..38422ab249dd --- /dev/null +++ b/Documentation/arm64/sve.rst @@ -0,0 +1,529 @@ +=================================================== +Scalable Vector Extension support for AArch64 Linux +=================================================== + +Author: Dave Martin + +Date: 4 August 2017 + +This document outlines briefly the interface provided to userspace by Linux in +order to support use of the ARM Scalable Vector Extension (SVE). + +This is an outline of the most important features and issues only and not +intended to be exhaustive. + +This document does not aim to describe the SVE architecture or programmer's +model. To aid understanding, a minimal description of relevant programmer's +model features for SVE is included in Appendix A. + + +1. General +----------- + +* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are + tracked per-thread. + +* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector + AT_HWCAP entry. Presence of this flag implies the presence of the SVE + instructions and registers, and the Linux-specific system interfaces + described in this document. SVE is reported in /proc/cpuinfo as "sve". + +* Support for the execution of SVE instructions in userspace can also be + detected by reading the CPU ID register ID_AA64PFR0_EL1 using an MRS + instruction, and checking that the value of the SVE field is nonzero. [3] + + It does not guarantee the presence of the system interfaces described in the + following sections: software that needs to verify that those interfaces are + present must check for HWCAP_SVE instead. + +* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also + be reported in the AT_HWCAP2 aux vector entry. In addition to this, + optional extensions to SVE2 may be reported by the presence of: + + HWCAP2_SVE2 + HWCAP2_SVEAES + HWCAP2_SVEPMULL + HWCAP2_SVEBITPERM + HWCAP2_SVESHA3 + HWCAP2_SVESM4 + + This list may be extended over time as the SVE architecture evolves. + + These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1, + which userspace can read using an MRS instruction. See elf_hwcaps.txt and + cpu-feature-registers.txt for details. + +* Debuggers should restrict themselves to interacting with the target via the + NT_ARM_SVE regset. The recommended way of detecting support for this regset + is to connect to a target process first and then attempt a + ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). + + +2. Vector length terminology +----------------------------- + +The size of an SVE vector (Z) register is referred to as the "vector length". + +To avoid confusion about the units used to express vector length, the kernel +adopts the following conventions: + +* Vector length (VL) = size of a Z-register in bytes + +* Vector quadwords (VQ) = size of a Z-register in units of 128 bits + +(So, VL = 16 * VQ.) + +The VQ convention is used where the underlying granularity is important, such +as in data structure definitions. In most other situations, the VL convention +is used. This is consistent with the meaning of the "VL" pseudo-register in +the SVE instruction set architecture. + + +3. System call behaviour +------------------------- + +* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of + Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR + become unspecified on return from a syscall. + +* The SVE registers are not used to pass arguments to or receive results from + any syscall. + +* In practice the affected registers/bits will be preserved or will be replaced + with zeros on return from a syscall, but userspace should not make + assumptions about this. The kernel behaviour may vary on a case-by-case + basis. + +* All other SVE state of a thread, including the currently configured vector + length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector + length (if any), is preserved across all syscalls, subject to the specific + exceptions for execve() described in section 6. + + In particular, on return from a fork() or clone(), the parent and new child + process or thread share identical SVE configuration, matching that of the + parent before the call. + + +4. Signal handling +------------------- + +* A new signal frame record sve_context encodes the SVE registers on signal + delivery. [1] + +* This record is supplementary to fpsimd_context. The FPSR and FPCR registers + are only present in fpsimd_context. For convenience, the content of V0..V31 + is duplicated between sve_context and fpsimd_context. + +* The signal frame record for SVE always contains basic metadata, in particular + the thread's vector length (in sve_context.vl). + +* The SVE registers may or may not be included in the record, depending on + whether the registers are live for the thread. The registers are present if + and only if: + sve_context.head.size >= SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve_context.vl)). + +* If the registers are present, the remainder of the record has a vl-dependent + size and layout. Macros SVE_SIG_* are defined [1] to facilitate access to + the members. + +* If the SVE context is too big to fit in sigcontext.__reserved[], then extra + space is allocated on the stack, an extra_context record is written in + __reserved[] referencing this space. sve_context is then written in the + extra space. Refer to [1] for further details about this mechanism. + + +5. Signal return +----------------- + +When returning from a signal handler: + +* If there is no sve_context record in the signal frame, or if the record is + present but contains no register data as desribed in the previous section, + then the SVE registers/bits become non-live and take unspecified values. + +* If sve_context is present in the signal frame and contains full register + data, the SVE registers become live and are populated with the specified + data. However, for backward compatibility reasons, bits [127:0] of Z0..Z31 + are always restored from the corresponding members of fpsimd_context.vregs[] + and not from sve_context. The remaining bits are restored from sve_context. + +* Inclusion of fpsimd_context in the signal frame remains mandatory, + irrespective of whether sve_context is present or not. + +* The vector length cannot be changed via signal return. If sve_context.vl in + the signal frame does not match the current vector length, the signal return + attempt is treated as illegal, resulting in a forced SIGSEGV. + + +6. prctl extensions +-------------------- + +Some new prctl() calls are added to allow programs to manage the SVE vector +length: + +prctl(PR_SVE_SET_VL, unsigned long arg) + + Sets the vector length of the calling thread and related flags, where + arg == vl | flags. Other threads of the calling process are unaffected. + + vl is the desired vector length, where sve_vl_valid(vl) must be true. + + flags: + + PR_SVE_SET_VL_INHERIT + + Inherit the current vector length across execve(). Otherwise, the + vector length is reset to the system default at execve(). (See + Section 9.) + + PR_SVE_SET_VL_ONEXEC + + Defer the requested vector length change until the next execve() + performed by this thread. + + The effect is equivalent to implicit exceution of the following + call immediately after the next execve() (if any) by the thread: + + prctl(PR_SVE_SET_VL, arg & ~PR_SVE_SET_VL_ONEXEC) + + This allows launching of a new program with a different vector + length, while avoiding runtime side effects in the caller. + + + Without PR_SVE_SET_VL_ONEXEC, the requested change takes effect + immediately. + + + Return value: a nonnegative on success, or a negative value on error: + EINVAL: SVE not supported, invalid vector length requested, or + invalid flags. + + + On success: + + * Either the calling thread's vector length or the deferred vector length + to be applied at the next execve() by the thread (dependent on whether + PR_SVE_SET_VL_ONEXEC is present in arg), is set to the largest value + supported by the system that is less than or equal to vl. If vl == + SVE_VL_MAX, the value set will be the largest value supported by the + system. + + * Any previously outstanding deferred vector length change in the calling + thread is cancelled. + + * The returned value describes the resulting configuration, encoded as for + PR_SVE_GET_VL. The vector length reported in this value is the new + current vector length for this thread if PR_SVE_SET_VL_ONEXEC was not + present in arg; otherwise, the reported vector length is the deferred + vector length that will be applied at the next execve() by the calling + thread. + + * Changing the vector length causes all of P0..P15, FFR and all bits of + Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become + unspecified. Calling PR_SVE_SET_VL with vl equal to the thread's current + vector length, or calling PR_SVE_SET_VL with the PR_SVE_SET_VL_ONEXEC + flag, does not constitute a change to the vector length for this purpose. + + +prctl(PR_SVE_GET_VL) + + Gets the vector length of the calling thread. + + The following flag may be OR-ed into the result: + + PR_SVE_SET_VL_INHERIT + + Vector length will be inherited across execve(). + + There is no way to determine whether there is an outstanding deferred + vector length change (which would only normally be the case between a + fork() or vfork() and the corresponding execve() in typical use). + + To extract the vector length from the result, and it with + PR_SVE_VL_LEN_MASK. + + Return value: a nonnegative value on success, or a negative value on error: + EINVAL: SVE not supported. + + +7. ptrace extensions +--------------------- + +* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and + PTRACE_SETREGSET. + + Refer to [2] for definitions. + +The regset data starts with struct user_sve_header, containing: + + size + + Size of the complete regset, in bytes. + This depends on vl and possibly on other things in the future. + + If a call to PTRACE_GETREGSET requests less data than the value of + size, the caller can allocate a larger buffer and retry in order to + read the complete regset. + + max_size + + Maximum size in bytes that the regset can grow to for the target + thread. The regset won't grow bigger than this even if the target + thread changes its vector length etc. + + vl + + Target thread's current vector length, in bytes. + + max_vl + + Maximum possible vector length for the target thread. + + flags + + either + + SVE_PT_REGS_FPSIMD + + SVE registers are not live (GETREGSET) or are to be made + non-live (SETREGSET). + + The payload is of type struct user_fpsimd_state, with the same + meaning as for NT_PRFPREG, starting at offset + SVE_PT_FPSIMD_OFFSET from the start of user_sve_header. + + Extra data might be appended in the future: the size of the + payload should be obtained using SVE_PT_FPSIMD_SIZE(vq, flags). + + vq should be obtained using sve_vq_from_vl(vl). + + or + + SVE_PT_REGS_SVE + + SVE registers are live (GETREGSET) or are to be made live + (SETREGSET). + + The payload contains the SVE register data, starting at offset + SVE_PT_SVE_OFFSET from the start of user_sve_header, and with + size SVE_PT_SVE_SIZE(vq, flags); + + ... OR-ed with zero or more of the following flags, which have the same + meaning and behaviour as the corresponding PR_SET_VL_* flags: + + SVE_PT_VL_INHERIT + + SVE_PT_VL_ONEXEC (SETREGSET only). + +* The effects of changing the vector length and/or flags are equivalent to + those documented for PR_SVE_SET_VL. + + The caller must make a further GETREGSET call if it needs to know what VL is + actually set by SETREGSET, unless is it known in advance that the requested + VL is supported. + +* In the SVE_PT_REGS_SVE case, the size and layout of the payload depends on + the header fields. The SVE_PT_SVE_*() macros are provided to facilitate + access to the members. + +* In either case, for SETREGSET it is permissible to omit the payload, in which + case only the vector length and flags are changed (along with any + consequences of those changes). + +* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the + requested VL is not supported, the effect will be the same as if the + payload were omitted, except that an EIO error is reported. No + attempt is made to translate the payload data to the correct layout + for the vector length actually set. The thread's FPSIMD state is + preserved, but the remaining bits of the SVE registers become + unspecified. It is up to the caller to translate the payload layout + for the actual VL and retry. + +* The effect of writing a partial, incomplete payload is unspecified. + + +8. ELF coredump extensions +--------------------------- + +* A NT_ARM_SVE note will be added to each coredump for each thread of the + dumped process. The contents will be equivalent to the data that would have + been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread + when the coredump was generated. + + +9. System runtime configuration +-------------------------------- + +* To mitigate the ABI impact of expansion of the signal frame, a policy + mechanism is provided for administrators, distro maintainers and developers + to set the default vector length for userspace processes: + +/proc/sys/abi/sve_default_vector_length + + Writing the text representation of an integer to this file sets the system + default vector length to the specified value, unless the value is greater + than the maximum vector length supported by the system in which case the + default vector length is set to that maximum. + + The result can be determined by reopening the file and reading its + contents. + + At boot, the default vector length is initially set to 64 or the maximum + supported vector length, whichever is smaller. This determines the initial + vector length of the init process (PID 1). + + Reading this file returns the current system default vector length. + +* At every execve() call, the new vector length of the new process is set to + the system default vector length, unless + + * PR_SVE_SET_VL_INHERIT (or equivalently SVE_PT_VL_INHERIT) is set for the + calling thread, or + + * a deferred vector length change is pending, established via the + PR_SVE_SET_VL_ONEXEC flag (or SVE_PT_VL_ONEXEC). + +* Modifying the system default vector length does not affect the vector length + of any existing process or thread that does not make an execve() call. + + +Appendix A. SVE programmer's model (informative) +================================================= + +This section provides a minimal description of the additions made by SVE to the +ARMv8-A programmer's model that are relevant to this document. + +Note: This section is for information only and not intended to be complete or +to replace any architectural specification. + +A.1. Registers +--------------- + +In A64 state, SVE adds the following: + +* 32 8VL-bit vector registers Z0..Z31 + For each Zn, Zn bits [127:0] alias the ARMv8-A vector register Vn. + + A register write using a Vn register name zeros all bits of the corresponding + Zn except for bits [127:0]. + +* 16 VL-bit predicate registers P0..P15 + +* 1 VL-bit special-purpose predicate register FFR (the "first-fault register") + +* a VL "pseudo-register" that determines the size of each vector register + + The SVE instruction set architecture provides no way to write VL directly. + Instead, it can be modified only by EL1 and above, by writing appropriate + system registers. + +* The value of VL can be configured at runtime by EL1 and above: + 16 <= VL <= VLmax, where VL must be a multiple of 16. + +* The maximum vector length is determined by the hardware: + 16 <= VLmax <= 256. + + (The SVE architecture specifies 256, but permits future architecture + revisions to raise this limit.) + +* FPSR and FPCR are retained from ARMv8-A, and interact with SVE floating-point + operations in a similar way to the way in which they interact with ARMv8 + floating-point operations:: + + 8VL-1 128 0 bit index + +---- //// -----------------+ + Z0 | : V0 | + : : + Z7 | : V7 | + Z8 | : * V8 | + : : : + Z15 | : *V15 | + Z16 | : V16 | + : : + Z31 | : V31 | + +---- //// -----------------+ + 31 0 + VL-1 0 +-------+ + +---- //// --+ FPSR | | + P0 | | +-------+ + : | | *FPCR | | + P15 | | +-------+ + +---- //// --+ + FFR | | +-----+ + +---- //// --+ VL | | + +-----+ + +(*) callee-save: + This only applies to bits [63:0] of Z-/V-registers. + FPCR contains callee-save and caller-save bits. See [4] for details. + + +A.2. Procedure call standard +----------------------------- + +The ARMv8-A base procedure call standard is extended as follows with respect to +the additional SVE register state: + +* All SVE register bits that are not shared with FP/SIMD are caller-save. + +* Z8 bits [63:0] .. Z15 bits [63:0] are callee-save. + + This follows from the way these bits are mapped to V8..V15, which are caller- + save in the base procedure call standard. + + +Appendix B. ARMv8-A FP/SIMD programmer's model +=============================================== + +Note: This section is for information only and not intended to be complete or +to replace any architectural specification. + +Refer to [4] for for more information. + +ARMv8-A defines the following floating-point / SIMD register state: + +* 32 128-bit vector registers V0..V31 +* 2 32-bit status/control registers FPSR, FPCR + +:: + + 127 0 bit index + +---------------+ + V0 | | + : : : + V7 | | + * V8 | | + : : : : + *V15 | | + V16 | | + : : : + V31 | | + +---------------+ + + 31 0 + +-------+ + FPSR | | + +-------+ + *FPCR | | + +-------+ + +(*) callee-save: + This only applies to bits [63:0] of V-registers. + FPCR contains a mixture of callee-save and caller-save bits. + + +References +========== + +[1] arch/arm64/include/uapi/asm/sigcontext.h + AArch64 Linux signal ABI definitions + +[2] arch/arm64/include/uapi/asm/ptrace.h + AArch64 Linux ptrace ABI definitions + +[3] Documentation/arm64/cpu-feature-registers.rst + +[4] ARM IHI0055C + http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf + http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html + Procedure Call Standard for the ARM 64-bit Architecture (AArch64) diff --git a/Documentation/arm64/sve.txt b/Documentation/arm64/sve.txt deleted file mode 100644 index 9940e924a47e..000000000000 --- a/Documentation/arm64/sve.txt +++ /dev/null @@ -1,525 +0,0 @@ - Scalable Vector Extension support for AArch64 Linux - =================================================== - -Author: Dave Martin -Date: 4 August 2017 - -This document outlines briefly the interface provided to userspace by Linux in -order to support use of the ARM Scalable Vector Extension (SVE). - -This is an outline of the most important features and issues only and not -intended to be exhaustive. - -This document does not aim to describe the SVE architecture or programmer's -model. To aid understanding, a minimal description of relevant programmer's -model features for SVE is included in Appendix A. - - -1. General ------------ - -* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are - tracked per-thread. - -* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector - AT_HWCAP entry. Presence of this flag implies the presence of the SVE - instructions and registers, and the Linux-specific system interfaces - described in this document. SVE is reported in /proc/cpuinfo as "sve". - -* Support for the execution of SVE instructions in userspace can also be - detected by reading the CPU ID register ID_AA64PFR0_EL1 using an MRS - instruction, and checking that the value of the SVE field is nonzero. [3] - - It does not guarantee the presence of the system interfaces described in the - following sections: software that needs to verify that those interfaces are - present must check for HWCAP_SVE instead. - -* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also - be reported in the AT_HWCAP2 aux vector entry. In addition to this, - optional extensions to SVE2 may be reported by the presence of: - - HWCAP2_SVE2 - HWCAP2_SVEAES - HWCAP2_SVEPMULL - HWCAP2_SVEBITPERM - HWCAP2_SVESHA3 - HWCAP2_SVESM4 - - This list may be extended over time as the SVE architecture evolves. - - These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1, - which userspace can read using an MRS instruction. See elf_hwcaps.txt and - cpu-feature-registers.txt for details. - -* Debuggers should restrict themselves to interacting with the target via the - NT_ARM_SVE regset. The recommended way of detecting support for this regset - is to connect to a target process first and then attempt a - ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). - - -2. Vector length terminology ------------------------------ - -The size of an SVE vector (Z) register is referred to as the "vector length". - -To avoid confusion about the units used to express vector length, the kernel -adopts the following conventions: - -* Vector length (VL) = size of a Z-register in bytes - -* Vector quadwords (VQ) = size of a Z-register in units of 128 bits - -(So, VL = 16 * VQ.) - -The VQ convention is used where the underlying granularity is important, such -as in data structure definitions. In most other situations, the VL convention -is used. This is consistent with the meaning of the "VL" pseudo-register in -the SVE instruction set architecture. - - -3. System call behaviour -------------------------- - -* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of - Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR - become unspecified on return from a syscall. - -* The SVE registers are not used to pass arguments to or receive results from - any syscall. - -* In practice the affected registers/bits will be preserved or will be replaced - with zeros on return from a syscall, but userspace should not make - assumptions about this. The kernel behaviour may vary on a case-by-case - basis. - -* All other SVE state of a thread, including the currently configured vector - length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector - length (if any), is preserved across all syscalls, subject to the specific - exceptions for execve() described in section 6. - - In particular, on return from a fork() or clone(), the parent and new child - process or thread share identical SVE configuration, matching that of the - parent before the call. - - -4. Signal handling -------------------- - -* A new signal frame record sve_context encodes the SVE registers on signal - delivery. [1] - -* This record is supplementary to fpsimd_context. The FPSR and FPCR registers - are only present in fpsimd_context. For convenience, the content of V0..V31 - is duplicated between sve_context and fpsimd_context. - -* The signal frame record for SVE always contains basic metadata, in particular - the thread's vector length (in sve_context.vl). - -* The SVE registers may or may not be included in the record, depending on - whether the registers are live for the thread. The registers are present if - and only if: - sve_context.head.size >= SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve_context.vl)). - -* If the registers are present, the remainder of the record has a vl-dependent - size and layout. Macros SVE_SIG_* are defined [1] to facilitate access to - the members. - -* If the SVE context is too big to fit in sigcontext.__reserved[], then extra - space is allocated on the stack, an extra_context record is written in - __reserved[] referencing this space. sve_context is then written in the - extra space. Refer to [1] for further details about this mechanism. - - -5. Signal return ------------------ - -When returning from a signal handler: - -* If there is no sve_context record in the signal frame, or if the record is - present but contains no register data as desribed in the previous section, - then the SVE registers/bits become non-live and take unspecified values. - -* If sve_context is present in the signal frame and contains full register - data, the SVE registers become live and are populated with the specified - data. However, for backward compatibility reasons, bits [127:0] of Z0..Z31 - are always restored from the corresponding members of fpsimd_context.vregs[] - and not from sve_context. The remaining bits are restored from sve_context. - -* Inclusion of fpsimd_context in the signal frame remains mandatory, - irrespective of whether sve_context is present or not. - -* The vector length cannot be changed via signal return. If sve_context.vl in - the signal frame does not match the current vector length, the signal return - attempt is treated as illegal, resulting in a forced SIGSEGV. - - -6. prctl extensions --------------------- - -Some new prctl() calls are added to allow programs to manage the SVE vector -length: - -prctl(PR_SVE_SET_VL, unsigned long arg) - - Sets the vector length of the calling thread and related flags, where - arg == vl | flags. Other threads of the calling process are unaffected. - - vl is the desired vector length, where sve_vl_valid(vl) must be true. - - flags: - - PR_SVE_SET_VL_INHERIT - - Inherit the current vector length across execve(). Otherwise, the - vector length is reset to the system default at execve(). (See - Section 9.) - - PR_SVE_SET_VL_ONEXEC - - Defer the requested vector length change until the next execve() - performed by this thread. - - The effect is equivalent to implicit exceution of the following - call immediately after the next execve() (if any) by the thread: - - prctl(PR_SVE_SET_VL, arg & ~PR_SVE_SET_VL_ONEXEC) - - This allows launching of a new program with a different vector - length, while avoiding runtime side effects in the caller. - - - Without PR_SVE_SET_VL_ONEXEC, the requested change takes effect - immediately. - - - Return value: a nonnegative on success, or a negative value on error: - EINVAL: SVE not supported, invalid vector length requested, or - invalid flags. - - - On success: - - * Either the calling thread's vector length or the deferred vector length - to be applied at the next execve() by the thread (dependent on whether - PR_SVE_SET_VL_ONEXEC is present in arg), is set to the largest value - supported by the system that is less than or equal to vl. If vl == - SVE_VL_MAX, the value set will be the largest value supported by the - system. - - * Any previously outstanding deferred vector length change in the calling - thread is cancelled. - - * The returned value describes the resulting configuration, encoded as for - PR_SVE_GET_VL. The vector length reported in this value is the new - current vector length for this thread if PR_SVE_SET_VL_ONEXEC was not - present in arg; otherwise, the reported vector length is the deferred - vector length that will be applied at the next execve() by the calling - thread. - - * Changing the vector length causes all of P0..P15, FFR and all bits of - Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become - unspecified. Calling PR_SVE_SET_VL with vl equal to the thread's current - vector length, or calling PR_SVE_SET_VL with the PR_SVE_SET_VL_ONEXEC - flag, does not constitute a change to the vector length for this purpose. - - -prctl(PR_SVE_GET_VL) - - Gets the vector length of the calling thread. - - The following flag may be OR-ed into the result: - - PR_SVE_SET_VL_INHERIT - - Vector length will be inherited across execve(). - - There is no way to determine whether there is an outstanding deferred - vector length change (which would only normally be the case between a - fork() or vfork() and the corresponding execve() in typical use). - - To extract the vector length from the result, and it with - PR_SVE_VL_LEN_MASK. - - Return value: a nonnegative value on success, or a negative value on error: - EINVAL: SVE not supported. - - -7. ptrace extensions ---------------------- - -* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and - PTRACE_SETREGSET. - - Refer to [2] for definitions. - -The regset data starts with struct user_sve_header, containing: - - size - - Size of the complete regset, in bytes. - This depends on vl and possibly on other things in the future. - - If a call to PTRACE_GETREGSET requests less data than the value of - size, the caller can allocate a larger buffer and retry in order to - read the complete regset. - - max_size - - Maximum size in bytes that the regset can grow to for the target - thread. The regset won't grow bigger than this even if the target - thread changes its vector length etc. - - vl - - Target thread's current vector length, in bytes. - - max_vl - - Maximum possible vector length for the target thread. - - flags - - either - - SVE_PT_REGS_FPSIMD - - SVE registers are not live (GETREGSET) or are to be made - non-live (SETREGSET). - - The payload is of type struct user_fpsimd_state, with the same - meaning as for NT_PRFPREG, starting at offset - SVE_PT_FPSIMD_OFFSET from the start of user_sve_header. - - Extra data might be appended in the future: the size of the - payload should be obtained using SVE_PT_FPSIMD_SIZE(vq, flags). - - vq should be obtained using sve_vq_from_vl(vl). - - or - - SVE_PT_REGS_SVE - - SVE registers are live (GETREGSET) or are to be made live - (SETREGSET). - - The payload contains the SVE register data, starting at offset - SVE_PT_SVE_OFFSET from the start of user_sve_header, and with - size SVE_PT_SVE_SIZE(vq, flags); - - ... OR-ed with zero or more of the following flags, which have the same - meaning and behaviour as the corresponding PR_SET_VL_* flags: - - SVE_PT_VL_INHERIT - - SVE_PT_VL_ONEXEC (SETREGSET only). - -* The effects of changing the vector length and/or flags are equivalent to - those documented for PR_SVE_SET_VL. - - The caller must make a further GETREGSET call if it needs to know what VL is - actually set by SETREGSET, unless is it known in advance that the requested - VL is supported. - -* In the SVE_PT_REGS_SVE case, the size and layout of the payload depends on - the header fields. The SVE_PT_SVE_*() macros are provided to facilitate - access to the members. - -* In either case, for SETREGSET it is permissible to omit the payload, in which - case only the vector length and flags are changed (along with any - consequences of those changes). - -* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the - requested VL is not supported, the effect will be the same as if the - payload were omitted, except that an EIO error is reported. No - attempt is made to translate the payload data to the correct layout - for the vector length actually set. The thread's FPSIMD state is - preserved, but the remaining bits of the SVE registers become - unspecified. It is up to the caller to translate the payload layout - for the actual VL and retry. - -* The effect of writing a partial, incomplete payload is unspecified. - - -8. ELF coredump extensions ---------------------------- - -* A NT_ARM_SVE note will be added to each coredump for each thread of the - dumped process. The contents will be equivalent to the data that would have - been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread - when the coredump was generated. - - -9. System runtime configuration --------------------------------- - -* To mitigate the ABI impact of expansion of the signal frame, a policy - mechanism is provided for administrators, distro maintainers and developers - to set the default vector length for userspace processes: - -/proc/sys/abi/sve_default_vector_length - - Writing the text representation of an integer to this file sets the system - default vector length to the specified value, unless the value is greater - than the maximum vector length supported by the system in which case the - default vector length is set to that maximum. - - The result can be determined by reopening the file and reading its - contents. - - At boot, the default vector length is initially set to 64 or the maximum - supported vector length, whichever is smaller. This determines the initial - vector length of the init process (PID 1). - - Reading this file returns the current system default vector length. - -* At every execve() call, the new vector length of the new process is set to - the system default vector length, unless - - * PR_SVE_SET_VL_INHERIT (or equivalently SVE_PT_VL_INHERIT) is set for the - calling thread, or - - * a deferred vector length change is pending, established via the - PR_SVE_SET_VL_ONEXEC flag (or SVE_PT_VL_ONEXEC). - -* Modifying the system default vector length does not affect the vector length - of any existing process or thread that does not make an execve() call. - - -Appendix A. SVE programmer's model (informative) -================================================= - -This section provides a minimal description of the additions made by SVE to the -ARMv8-A programmer's model that are relevant to this document. - -Note: This section is for information only and not intended to be complete or -to replace any architectural specification. - -A.1. Registers ---------------- - -In A64 state, SVE adds the following: - -* 32 8VL-bit vector registers Z0..Z31 - For each Zn, Zn bits [127:0] alias the ARMv8-A vector register Vn. - - A register write using a Vn register name zeros all bits of the corresponding - Zn except for bits [127:0]. - -* 16 VL-bit predicate registers P0..P15 - -* 1 VL-bit special-purpose predicate register FFR (the "first-fault register") - -* a VL "pseudo-register" that determines the size of each vector register - - The SVE instruction set architecture provides no way to write VL directly. - Instead, it can be modified only by EL1 and above, by writing appropriate - system registers. - -* The value of VL can be configured at runtime by EL1 and above: - 16 <= VL <= VLmax, where VL must be a multiple of 16. - -* The maximum vector length is determined by the hardware: - 16 <= VLmax <= 256. - - (The SVE architecture specifies 256, but permits future architecture - revisions to raise this limit.) - -* FPSR and FPCR are retained from ARMv8-A, and interact with SVE floating-point - operations in a similar way to the way in which they interact with ARMv8 - floating-point operations. - - 8VL-1 128 0 bit index - +---- //// -----------------+ - Z0 | : V0 | - : : - Z7 | : V7 | - Z8 | : * V8 | - : : : - Z15 | : *V15 | - Z16 | : V16 | - : : - Z31 | : V31 | - +---- //// -----------------+ - 31 0 - VL-1 0 +-------+ - +---- //// --+ FPSR | | - P0 | | +-------+ - : | | *FPCR | | - P15 | | +-------+ - +---- //// --+ - FFR | | +-----+ - +---- //// --+ VL | | - +-----+ - -(*) callee-save: - This only applies to bits [63:0] of Z-/V-registers. - FPCR contains callee-save and caller-save bits. See [4] for details. - - -A.2. Procedure call standard ------------------------------ - -The ARMv8-A base procedure call standard is extended as follows with respect to -the additional SVE register state: - -* All SVE register bits that are not shared with FP/SIMD are caller-save. - -* Z8 bits [63:0] .. Z15 bits [63:0] are callee-save. - - This follows from the way these bits are mapped to V8..V15, which are caller- - save in the base procedure call standard. - - -Appendix B. ARMv8-A FP/SIMD programmer's model -=============================================== - -Note: This section is for information only and not intended to be complete or -to replace any architectural specification. - -Refer to [4] for for more information. - -ARMv8-A defines the following floating-point / SIMD register state: - -* 32 128-bit vector registers V0..V31 -* 2 32-bit status/control registers FPSR, FPCR - - 127 0 bit index - +---------------+ - V0 | | - : : : - V7 | | - * V8 | | - : : : : - *V15 | | - V16 | | - : : : - V31 | | - +---------------+ - - 31 0 - +-------+ - FPSR | | - +-------+ - *FPCR | | - +-------+ - -(*) callee-save: - This only applies to bits [63:0] of V-registers. - FPCR contains a mixture of callee-save and caller-save bits. - - -References -========== - -[1] arch/arm64/include/uapi/asm/sigcontext.h - AArch64 Linux signal ABI definitions - -[2] arch/arm64/include/uapi/asm/ptrace.h - AArch64 Linux ptrace ABI definitions - -[3] Documentation/arm64/cpu-feature-registers.txt - -[4] ARM IHI0055C - http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf - http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html - Procedure Call Standard for the ARM 64-bit Architecture (AArch64) diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst new file mode 100644 index 000000000000..2acdec3ebbeb --- /dev/null +++ b/Documentation/arm64/tagged-pointers.rst @@ -0,0 +1,68 @@ +========================================= +Tagged virtual addresses in AArch64 Linux +========================================= + +Author: Will Deacon + +Date : 12 June 2013 + +This document briefly describes the provision of tagged virtual +addresses in the AArch64 translation system and their potential uses +in AArch64 Linux. + +The kernel configures the translation tables so that translations made +via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of +the virtual address ignored by the translation hardware. This frees up +this byte for application use. + + +Passing tagged addresses to the kernel +-------------------------------------- + +All interpretation of userspace memory addresses by the kernel assumes +an address tag of 0x00. + +This includes, but is not limited to, addresses found in: + + - pointer arguments to system calls, including pointers in structures + passed to system calls, + + - the stack pointer (sp), e.g. when interpreting it to deliver a + signal, + + - the frame pointer (x29) and frame records, e.g. when interpreting + them to generate a backtrace or call graph. + +Using non-zero address tags in any of these locations may result in an +error code being returned, a (fatal) signal being raised, or other modes +of failure. + +For these reasons, passing non-zero address tags to the kernel via +system calls is forbidden, and using a non-zero address tag for sp is +strongly discouraged. + +Programs maintaining a frame pointer and frame records that use non-zero +address tags may suffer impaired or inaccurate debug and profiling +visibility. + + +Preserving tags +--------------- + +Non-zero tags are not preserved when delivering signals. This means that +signal handlers in applications making use of tags cannot rely on the +tag information for user virtual addresses being maintained for fields +inside siginfo_t. One exception to this rule is for signals raised in +response to watchpoint debug exceptions, where the tag information will +be preserved. + +The architecture prevents the use of a tagged PC, so the upper byte will +be set to a sign-extension of bit 55 on exception return. + + +Other considerations +-------------------- + +Special care should be taken when using tagged pointers, since it is +likely that C compilers will not hazard two virtual addresses differing +only in the upper byte. diff --git a/Documentation/arm64/tagged-pointers.txt b/Documentation/arm64/tagged-pointers.txt deleted file mode 100644 index a25a99e82bb1..000000000000 --- a/Documentation/arm64/tagged-pointers.txt +++ /dev/null @@ -1,66 +0,0 @@ - Tagged virtual addresses in AArch64 Linux - ========================================= - -Author: Will Deacon -Date : 12 June 2013 - -This document briefly describes the provision of tagged virtual -addresses in the AArch64 translation system and their potential uses -in AArch64 Linux. - -The kernel configures the translation tables so that translations made -via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of -the virtual address ignored by the translation hardware. This frees up -this byte for application use. - - -Passing tagged addresses to the kernel --------------------------------------- - -All interpretation of userspace memory addresses by the kernel assumes -an address tag of 0x00. - -This includes, but is not limited to, addresses found in: - - - pointer arguments to system calls, including pointers in structures - passed to system calls, - - - the stack pointer (sp), e.g. when interpreting it to deliver a - signal, - - - the frame pointer (x29) and frame records, e.g. when interpreting - them to generate a backtrace or call graph. - -Using non-zero address tags in any of these locations may result in an -error code being returned, a (fatal) signal being raised, or other modes -of failure. - -For these reasons, passing non-zero address tags to the kernel via -system calls is forbidden, and using a non-zero address tag for sp is -strongly discouraged. - -Programs maintaining a frame pointer and frame records that use non-zero -address tags may suffer impaired or inaccurate debug and profiling -visibility. - - -Preserving tags ---------------- - -Non-zero tags are not preserved when delivering signals. This means that -signal handlers in applications making use of tags cannot rely on the -tag information for user virtual addresses being maintained for fields -inside siginfo_t. One exception to this rule is for signals raised in -response to watchpoint debug exceptions, where the tag information will -be preserved. - -The architecture prevents the use of a tagged PC, so the upper byte will -be set to a sign-extension of bit 55 on exception return. - - -Other considerations --------------------- - -Special care should be taken when using tagged pointers, since it is -likely that C compilers will not hazard two virtual addresses differing -only in the upper byte. diff --git a/Documentation/translations/zh_CN/arm64/booting.txt b/Documentation/translations/zh_CN/arm64/booting.txt index c1dd968c5ee9..3bfbf66e5a5e 100644 --- a/Documentation/translations/zh_CN/arm64/booting.txt +++ b/Documentation/translations/zh_CN/arm64/booting.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/booting.txt +Chinese translated version of Documentation/arm64/booting.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ M: Will Deacon zh_CN: Fu Wei C: 55f058e7574c3615dea4615573a19bdb258696c6 --------------------------------------------------------------------- -Documentation/arm64/booting.txt 的中文翻译 +Documentation/arm64/booting.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/legacy_instructions.txt b/Documentation/translations/zh_CN/arm64/legacy_instructions.txt index 68362a1ab717..e295cf75f606 100644 --- a/Documentation/translations/zh_CN/arm64/legacy_instructions.txt +++ b/Documentation/translations/zh_CN/arm64/legacy_instructions.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/legacy_instructions.txt +Chinese translated version of Documentation/arm64/legacy_instructions.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Punit Agrawal Suzuki K. Poulose Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/arm64/legacy_instructions.txt 的中文翻译 +Documentation/arm64/legacy_instructions.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/memory.txt b/Documentation/translations/zh_CN/arm64/memory.txt index 19b3a52d5d94..be20f8228b91 100644 --- a/Documentation/translations/zh_CN/arm64/memory.txt +++ b/Documentation/translations/zh_CN/arm64/memory.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/memory.txt +Chinese translated version of Documentation/arm64/memory.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -9,7 +9,7 @@ or if there is a problem with the translation. Maintainer: Catalin Marinas Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/arm64/memory.txt 的中文翻译 +Documentation/arm64/memory.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/silicon-errata.txt b/Documentation/translations/zh_CN/arm64/silicon-errata.txt index 39477c75c4a4..440c59ac7dce 100644 --- a/Documentation/translations/zh_CN/arm64/silicon-errata.txt +++ b/Documentation/translations/zh_CN/arm64/silicon-errata.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/silicon-errata.txt +Chinese translated version of Documentation/arm64/silicon-errata.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ M: Will Deacon zh_CN: Fu Wei C: 1926e54f115725a9248d0c4c65c22acaf94de4c4 --------------------------------------------------------------------- -Documentation/arm64/silicon-errata.txt 的中文翻译 +Documentation/arm64/silicon-errata.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/tagged-pointers.txt b/Documentation/translations/zh_CN/arm64/tagged-pointers.txt index 2664d1bd5a1c..77ac3548a16d 100644 --- a/Documentation/translations/zh_CN/arm64/tagged-pointers.txt +++ b/Documentation/translations/zh_CN/arm64/tagged-pointers.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/tagged-pointers.txt +Chinese translated version of Documentation/arm64/tagged-pointers.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -9,7 +9,7 @@ or if there is a problem with the translation. Maintainer: Will Deacon Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/arm64/tagged-pointers.txt 的中文翻译 +Documentation/arm64/tagged-pointers.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ba6c42c576dd..68984c284c40 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2205,7 +2205,7 @@ max_vq. This is the maximum vector length available to the guest on this vcpu, and determines which register slices are visible through this ioctl interface. -(See Documentation/arm64/sve.txt for an explanation of the "vq" +(See Documentation/arm64/sve.rst for an explanation of the "vq" nomenclature.) KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. -- cgit From e327cfcb25422c91f4bb8e8a3488386ac95955f1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:39 -0300 Subject: docs: cdrom-standard.tex: convert from LaTeX to ReST This is the only LaTeX documentation file inside the documentation. Instead of having a Latex document directly there, convert it to ReST format, as this is the format we're using for docs. For now, let's keep the extension as .txt in order to avoid warnings when building the documentation with Sphinx. The next patch patch will rename it to .rst and add it to the building system. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/cdrom/Makefile | 21 - Documentation/cdrom/cdrom-standard.tex | 1026 ------------------------------ Documentation/cdrom/cdrom-standard.txt | 1063 ++++++++++++++++++++++++++++++++ 3 files changed, 1063 insertions(+), 1047 deletions(-) delete mode 100644 Documentation/cdrom/Makefile delete mode 100644 Documentation/cdrom/cdrom-standard.tex create mode 100644 Documentation/cdrom/cdrom-standard.txt (limited to 'Documentation') diff --git a/Documentation/cdrom/Makefile b/Documentation/cdrom/Makefile deleted file mode 100644 index a19e321928e1..000000000000 --- a/Documentation/cdrom/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -LATEXFILE = cdrom-standard - -all: - make clean - latex $(LATEXFILE) - latex $(LATEXFILE) - @if [ -x `which gv` ]; then \ - `dvips -q -t letter -o $(LATEXFILE).ps $(LATEXFILE).dvi` ;\ - `gv -antialias -media letter -nocenter $(LATEXFILE).ps` ;\ - else \ - `xdvi $(LATEXFILE).dvi &` ;\ - fi - make sortofclean - -clean: - rm -f $(LATEXFILE).ps $(LATEXFILE).dvi $(LATEXFILE).aux $(LATEXFILE).log - -sortofclean: - rm -f $(LATEXFILE).aux $(LATEXFILE).log - - diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex deleted file mode 100644 index f7cd455973f7..000000000000 --- a/Documentation/cdrom/cdrom-standard.tex +++ /dev/null @@ -1,1026 +0,0 @@ -\documentclass{article} -\def\version{$Id: cdrom-standard.tex,v 1.9 1997/12/28 15:42:49 david Exp $} -\newcommand{\newsection}[1]{\newpage\section{#1}} - -\evensidemargin=0pt -\oddsidemargin=0pt -\topmargin=-\headheight \advance\topmargin by -\headsep -\textwidth=15.99cm \textheight=24.62cm % normal A4, 1'' margin - -\def\linux{{\sc Linux}} -\def\cdrom{{\sc cd-rom}} -\def\UCD{{\sc Uniform cd-rom Driver}} -\def\cdromc{{\tt {cdrom.c}}} -\def\cdromh{{\tt {cdrom.h}}} -\def\fo{\sl} % foreign words -\def\ie{{\fo i.e.}} -\def\eg{{\fo e.g.}} - -\everymath{\it} \everydisplay{\it} -\catcode `\_=\active \def_{\_\penalty100 } -\catcode`\<=\active \def<#1>{{\langle\hbox{\rm#1}\rangle}} - -\begin{document} -\title{A \linux\ \cdrom\ standard} -\author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl} -\\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}} -\\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}} -\date{12 March 1999} - -\maketitle - -\newsection{Introduction} - -\linux\ is probably the Unix-like operating system that supports -the widest variety of hardware devices. The reasons for this are -presumably -\begin{itemize} -\item - The large list of hardware devices available for the many platforms - that \linux\ now supports (\ie, i386-PCs, Sparc Suns, etc.) -\item - The open design of the operating system, such that anybody can write a - driver for \linux. -\item - There is plenty of source code around as examples of how to write a driver. -\end{itemize} -The openness of \linux, and the many different types of available -hardware has allowed \linux\ to support many different hardware devices. -Unfortunately, the very openness that has allowed \linux\ to support -all these different devices has also allowed the behavior of each -device driver to differ significantly from one device to another. -This divergence of behavior has been very significant for \cdrom\ -devices; the way a particular drive reacts to a `standard' $ioctl()$ -call varies greatly from one device driver to another. To avoid making -their drivers totally inconsistent, the writers of \linux\ \cdrom\ -drivers generally created new device drivers by understanding, copying, -and then changing an existing one. Unfortunately, this practice did not -maintain uniform behavior across all the \linux\ \cdrom\ drivers. - -This document describes an effort to establish Uniform behavior across -all the different \cdrom\ device drivers for \linux. This document also -defines the various $ioctl$s, and how the low-level \cdrom\ device -drivers should implement them. Currently (as of the \linux\ 2.1.$x$ -development kernels) several low-level \cdrom\ device drivers, including -both IDE/ATAPI and SCSI, now use this Uniform interface. - -When the \cdrom\ was developed, the interface between the \cdrom\ drive -and the computer was not specified in the standards. As a result, many -different \cdrom\ interfaces were developed. Some of them had their -own proprietary design (Sony, Mitsumi, Panasonic, Philips), other -manufacturers adopted an existing electrical interface and changed -the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply -adapted their drives to one or more of the already existing electrical -interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and -most of the `NoName' manufacturers). In cases where a new drive really -brought its own interface or used its own command set and flow control -scheme, either a separate driver had to be written, or an existing -driver had to be enhanced. History has delivered us \cdrom\ support for -many of these different interfaces. Nowadays, almost all new \cdrom\ -drives are either IDE/ATAPI or SCSI, and it is very unlikely that any -manufacturer will create a new interface. Even finding drives for the -old proprietary interfaces is getting difficult. - -When (in the 1.3.70's) I looked at the existing software interface, -which was expressed through \cdromh, it appeared to be a rather wild -set of commands and data formats.\footnote{I cannot recollect what -kernel version I looked at, then, presumably 1.2.13 and 1.3.34---the -latest kernel that I was indirectly involved in.} It seemed that many -features of the software interface had been added to accommodate the -capabilities of a particular drive, in an {\fo ad hoc\/} manner. More -importantly, it appeared that the behavior of the `standard' commands -was different for most of the different drivers: \eg, some drivers -close the tray if an $open()$ call occurs when the tray is open, while -others do not. Some drivers lock the door upon opening the device, to -prevent an incoherent file system, but others don't, to allow software -ejection. Undoubtedly, the capabilities of the different drives vary, -but even when two drives have the same capability their drivers' -behavior was usually different. - -I decided to start a discussion on how to make all the \linux\ \cdrom\ -drivers behave more uniformly. I began by contacting the developers of -the many \cdrom\ drivers found in the \linux\ kernel. Their reactions -encouraged me to write the \UCD\ which this document is intended to -describe. The implementation of the \UCD\ is in the file \cdromc. This -driver is intended to be an additional software layer that sits on top -of the low-level device drivers for each \cdrom\ drive. By adding this -additional layer, it is possible to have all the different \cdrom\ -devices behave {\em exactly\/} the same (insofar as the underlying -hardware will allow). - -The goal of the \UCD\ is {\em not\/} to alienate driver developers who -have not yet taken steps to support this effort. The goal of \UCD\ is -simply to give people writing application programs for \cdrom\ drives -{\em one\/} \linux\ \cdrom\ interface with consistent behavior for all -\cdrom\ devices. In addition, this also provides a consistent interface -between the low-level device driver code and the \linux\ kernel. Care -is taken that 100\,\% compatibility exists with the data structures and -programmer's interface defined in \cdromh. This guide was written to -help \cdrom\ driver developers adapt their code to use the \UCD\ code -defined in \cdromc. - -Personally, I think that the most important hardware interfaces are -the IDE/ATAPI drives and, of course, the SCSI drives, but as prices -of hardware drop continuously, it is also likely that people may have -more than one \cdrom\ drive, possibly of mixed types. It is important -that these drives behave in the same way. In December 1994, one of the -cheapest \cdrom\ drives was a Philips cm206, a double-speed proprietary -drive. In the months that I was busy writing a \linux\ driver for it, -proprietary drives became obsolete and IDE/ATAPI drives became the -standard. At the time of the last update to this document (November -1997) it is becoming difficult to even {\em find} anything less than a -16 speed \cdrom\ drive, and 24 speed drives are common. - -\newsection{Standardizing through another software level} -\label{cdrom.c} - -At the time this document was conceived, all drivers directly -implemented the \cdrom\ $ioctl()$ calls through their own routines. This -led to the danger of different drivers forgetting to do important things -like checking that the user was giving the driver valid data. More -importantly, this led to the divergence of behavior, which has already -been discussed. - -For this reason, the \UCD\ was created to enforce consistent \cdrom\ -drive behavior, and to provide a common set of services to the various -low-level \cdrom\ device drivers. The \UCD\ now provides another -software-level, that separates the $ioctl()$ and $open()$ implementation -from the actual hardware implementation. Note that this effort has -made few changes which will affect a user's application programs. The -greatest change involved moving the contents of the various low-level -\cdrom\ drivers' header files to the kernel's cdrom directory. This was -done to help ensure that the user is only presented with only one cdrom -interface, the interface defined in \cdromh. - -\cdrom\ drives are specific enough (\ie, different from other -block-devices such as floppy or hard disc drives), to define a set -of common {\em \cdrom\ device operations}, $_dops$. -These operations are different from the classical block-device file -operations, $_fops$. - -The routines for the \UCD\ interface level are implemented in the file -\cdromc. In this file, the \UCD\ interfaces with the kernel as a block -device by registering the following general $struct\ file_operations$: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -struct& file_operations\ cdrom_fops = \{\hidewidth\cr - &NULL, & lseek \cr - &block_read, & read---general block-dev read \cr - &block_write, & write---general block-dev write \cr - &NULL, & readdir \cr - &NULL, & select \cr - &cdrom_ioctl, & ioctl \cr - &NULL, & mmap \cr - &cdrom_open, & open \cr - &cdrom_release, & release \cr - &NULL, & fsync \cr - &NULL, & fasync \cr - &cdrom_media_changed, & media change \cr - &NULL & revalidate \cr -\};\cr -} -$$ - -Every active \cdrom\ device shares this $struct$. The routines -declared above are all implemented in \cdromc, since this file is the -place where the behavior of all \cdrom-devices is defined and -standardized. The actual interface to the various types of \cdrom\ -hardware is still performed by various low-level \cdrom-device -drivers. These routines simply implement certain {\em capabilities\/} -that are common to all \cdrom\ (and really, all removable-media -devices). - -Registration of a low-level \cdrom\ device driver is now done through -the general routines in \cdromc, not through the Virtual File System -(VFS) any more. The interface implemented in \cdromc\ is carried out -through two general structures that contain information about the -capabilities of the driver, and the specific drives on which the -driver operates. The structures are: -\begin{description} -\item[$cdrom_device_ops$] - This structure contains information about the low-level driver for a - \cdrom\ device. This structure is conceptually connected to the major - number of the device (although some drivers may have different - major numbers, as is the case for the IDE driver). -\item[$cdrom_device_info$] - This structure contains information about a particular \cdrom\ drive, - such as its device name, speed, etc. This structure is conceptually - connected to the minor number of the device. -\end{description} - -Registering a particular \cdrom\ drive with the \UCD\ is done by the -low-level device driver though a call to: -$$register_cdrom(struct\ cdrom_device_info * _info) -$$ -The device information structure, $_info$, contains all the -information needed for the kernel to interface with the low-level -\cdrom\ device driver. One of the most important entries in this -structure is a pointer to the $cdrom_device_ops$ structure of the -low-level driver. - -The device operations structure, $cdrom_device_ops$, contains a list -of pointers to the functions which are implemented in the low-level -device driver. When \cdromc\ accesses a \cdrom\ device, it does it -through the functions in this structure. It is impossible to know all -the capabilities of future \cdrom\ drives, so it is expected that this -list may need to be expanded from time to time as new technologies are -developed. For example, CD-R and CD-R/W drives are beginning to become -popular, and support will soon need to be added for them. For now, the -current $struct$ is: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}& - $/*$ \rm# $*/$\hfil\cr -struct& cdrom_device_ops\ \{ \hidewidth\cr - &int& (* open)(struct\ cdrom_device_info *, int)\cr - &void& (* release)(struct\ cdrom_device_info *);\cr - &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr - &unsigned\ int& (* check_events)(struct\ cdrom_device_info *, unsigned\ int, int);\cr - &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr - &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr - &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr - &int& (* select_speed)(struct\ cdrom_device_info *, int);\cr - &int& (* select_disc)(struct\ cdrom_device_info *, int);\cr - &int& (* get_last_session) (struct\ cdrom_device_info *, - struct\ cdrom_multisession *{});\cr - &int& (* get_mcn)(struct\ cdrom_device_info *, struct\ cdrom_mcn *{});\cr - &int& (* reset)(struct\ cdrom_device_info *);\cr - &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int, - void *{});\cr -\noalign{\medskip} - &const\ int& capability;& capability flags \cr - &int& (* generic_packet)(struct\ cdrom_device_info *, struct\ packet_command *{});\cr -\};\cr -} -$$ -When a low-level device driver implements one of these capabilities, -it should add a function pointer to this $struct$. When a particular -function is not implemented, however, this $struct$ should contain a -NULL instead. The $capability$ flags specify the capabilities of the -\cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive -is registered with the \UCD. - -Note that most functions have fewer parameters than their -$blkdev_fops$ counterparts. This is because very little of the -information in the structures $inode$ and $file$ is used. For most -drivers, the main parameter is the $struct$ $cdrom_device_info$, from -which the major and minor number can be extracted. (Most low-level -\cdrom\ drivers don't even look at the major and minor number though, -since many of them only support one device.) This will be available -through $dev$ in $cdrom_device_info$ described below. - -The drive-specific, minor-like information that is registered with -\cdromc, currently contains the following fields: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}& - $/*$ \rm# $*/$\hfil\cr -struct& cdrom_device_info\ \{ \hidewidth\cr - & const\ struct\ cdrom_device_ops *& ops;& device operations for this major\cr - & struct\ list_head& list;& linked list of all device_info\cr - & struct\ gendisk *& disk;& matching block layer disk\cr - & void *& handle;& driver-dependent data\cr -\noalign{\medskip} - & int& mask;& mask of capability: disables them \cr - & int& speed;& maximum speed for reading data \cr - & int& capacity;& number of discs in a jukebox \cr -\noalign{\medskip} - &unsigned\ int& options : 30;& options flags \cr - &unsigned& mc_flags : 2;& media-change buffer flags \cr - &unsigned\ int& vfs_events;& cached events for vfs path\cr - &unsigned\ int& ioctl_events;& cached events for ioctl path\cr - & int& use_count;& number of times device is opened\cr - & char& name[20];& name of the device type\cr -\noalign{\medskip} - &__u8& sanyo_slot : 2;& Sanyo 3-CD changer support\cr - &__u8& keeplocked : 1;& CDROM_LOCKDOOR status\cr - &__u8& reserved : 5;& not used yet\cr - & int& cdda_method;& see CDDA_* flags\cr - &__u8& last_sense;& saves last sense key\cr - &__u8& media_written;& dirty flag, DVD+RW bookkeeping\cr - &unsigned\ short& mmc3_profile;& current MMC3 profile\cr - & int& for_data;& unknown:TBD\cr - & int\ (* exit)\ (struct\ cdrom_device_info *);&& unknown:TBD\cr - & int& mrw_mode_page;& which MRW mode page is in use\cr -\}\cr -}$$ -Using this $struct$, a linked list of the registered minor devices is -built, using the $next$ field. The device number, the device operations -struct and specifications of properties of the drive are stored in this -structure. - -The $mask$ flags can be used to mask out some of the capabilities listed -in $ops\to capability$, if a specific drive doesn't support a feature -of the driver. The value $speed$ specifies the maximum head-rate of the -drive, measured in units of normal audio speed (176\,kB/sec raw data or -150\,kB/sec file system data). The parameters are declared $const$ -because they describe properties of the drive, which don't change after -registration. - -A few registers contain variables local to the \cdrom\ drive. The -flags $options$ are used to specify how the general \cdrom\ routines -should behave. These various flags registers should provide enough -flexibility to adapt to the different users' wishes (and {\em not\/} the -`arbitrary' wishes of the author of the low-level device driver, as is -the case in the old scheme). The register $mc_flags$ is used to buffer -the information from $media_changed()$ to two separate queues. Other -data that is specific to a minor drive, can be accessed through $handle$, -which can point to a data structure specific to the low-level driver. -The fields $use_count$, $next$, $options$ and $mc_flags$ need not be -initialized. - -The intermediate software layer that \cdromc\ forms will perform some -additional bookkeeping. The use count of the device (the number of -processes that have the device opened) is registered in $use_count$. The -function $cdrom_ioctl()$ will verify the appropriate user-memory regions -for read and write, and in case a location on the CD is transferred, -it will `sanitize' the format by making requests to the low-level -drivers in a standard format, and translating all formats between the -user-software and low level drivers. This relieves much of the drivers' -memory checking and format checking and translation. Also, the necessary -structures will be declared on the program stack. - -The implementation of the functions should be as defined in the -following sections. Two functions {\em must\/} be implemented, namely -$open()$ and $release()$. Other functions may be omitted, their -corresponding capability flags will be cleared upon registration. -Generally, a function returns zero on success and negative on error. A -function call should return only after the command has completed, but of -course waiting for the device should not use processor time. - -\subsection{$Int\ open(struct\ cdrom_device_info * cdi, int\ purpose)$} - -$Open()$ should try to open the device for a specific $purpose$, which -can be either: -\begin{itemize} -\item[0] Open for reading data, as done by {\tt {mount()}} (2), or the -user commands {\tt {dd}} or {\tt {cat}}. -\item[1] Open for $ioctl$ commands, as done by audio-CD playing -programs. -\end{itemize} -Notice that any strategic code (closing tray upon $open()$, etc.)\ is -done by the calling routine in \cdromc, so the low-level routine -should only be concerned with proper initialization, such as spinning -up the disc, etc. % and device-use count - - -\subsection{$Void\ release(struct\ cdrom_device_info * cdi)$} - - -Device-specific actions should be taken such as spinning down the device. -However, strategic actions such as ejection of the tray, or unlocking -the door, should be left over to the general routine $cdrom_release()$. -This is the only function returning type $void$. - -\subsection{$Int\ drive_status(struct\ cdrom_device_info * cdi, int\ slot_nr)$} -\label{drive status} - -The function $drive_status$, if implemented, should provide -information on the status of the drive (not the status of the disc, -which may or may not be in the drive). If the drive is not a changer, -$slot_nr$ should be ignored. In \cdromh\ the possibilities are listed: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDS_NO_INFO& no information available\cr -CDS_NO_DISC& no disc is inserted, tray is closed\cr -CDS_TRAY_OPEN& tray is opened\cr -CDS_DRIVE_NOT_READY& something is wrong, tray is moving?\cr -CDS_DISC_OK& a disc is loaded and everything is fine\cr -} -$$ - -\subsection{$Int\ media_changed(struct\ cdrom_device_info * cdi, int\ disc_nr)$} - -This function is very similar to the original function in $struct\ -file_operations$. It returns 1 if the medium of the device $cdi\to -dev$ has changed since the last call, and 0 otherwise. The parameter -$disc_nr$ identifies a specific slot in a juke-box, it should be -ignored for single-disc drives. Note that by `re-routing' this -function through $cdrom_media_changed()$, we can implement separate -queues for the VFS and a new $ioctl()$ function that can report device -changes to software (\eg, an auto-mounting daemon). - -\subsection{$Int\ tray_move(struct\ cdrom_device_info * cdi, int\ position)$} - -This function, if implemented, should control the tray movement. (No -other function should control this.) The parameter $position$ controls -the desired direction of movement: -\begin{itemize} -\item[0] Close tray -\item[1] Open tray -\end{itemize} -This function returns 0 upon success, and a non-zero value upon -error. Note that if the tray is already in the desired position, no -action need be taken, and the return value should be 0. - -\subsection{$Int\ lock_door(struct\ cdrom_device_info * cdi, int\ lock)$} - -This function (and no other code) controls locking of the door, if the -drive allows this. The value of $lock$ controls the desired locking -state: -\begin{itemize} -\item[0] Unlock door, manual opening is allowed -\item[1] Lock door, tray cannot be ejected manually -\end{itemize} -This function returns 0 upon success, and a non-zero value upon -error. Note that if the door is already in the requested state, no -action need be taken, and the return value should be 0. - -\subsection{$Int\ select_speed(struct\ cdrom_device_info * cdi, int\ speed)$} - -Some \cdrom\ drives are capable of changing their head-speed. There -are several reasons for changing the speed of a \cdrom\ drive. Badly -pressed \cdrom s may benefit from less-than-maximum head rate. Modern -\cdrom\ drives can obtain very high head rates (up to $24\times$ is -common). It has been reported that these drives can make reading -errors at these high speeds, reducing the speed can prevent data loss -in these circumstances. Finally, some of these drives can -make an annoyingly loud noise, which a lower speed may reduce. %Finally, -%although the audio-low-pass filters probably aren't designed for it, -%more than real-time playback of audio might be used for high-speed -%copying of audio tracks. - -This function specifies the speed at which data is read or audio is -played back. The value of $speed$ specifies the head-speed of the -drive, measured in units of standard cdrom speed (176\,kB/sec raw data -or 150\,kB/sec file system data). So to request that a \cdrom\ drive -operate at 300\,kB/sec you would call the CDROM_SELECT_SPEED $ioctl$ -with $speed=2$. The special value `0' means `auto-selection', \ie, -maximum data-rate or real-time audio rate. If the drive doesn't have -this `auto-selection' capability, the decision should be made on the -current disc loaded and the return value should be positive. A negative -return value indicates an error. - -\subsection{$Int\ select_disc(struct\ cdrom_device_info * cdi, int\ number)$} - -If the drive can store multiple discs (a juke-box) this function -will perform disc selection. It should return the number of the -selected disc on success, a negative value on error. Currently, only -the ide-cd driver supports this functionality. - -\subsection{$Int\ get_last_session(struct\ cdrom_device_info * cdi, struct\ - cdrom_multisession * ms_info)$} - -This function should implement the old corresponding $ioctl()$. For -device $cdi\to dev$, the start of the last session of the current disc -should be returned in the pointer argument $ms_info$. Note that -routines in \cdromc\ have sanitized this argument: its requested -format will {\em always\/} be of the type $CDROM_LBA$ (linear block -addressing mode), whatever the calling software requested. But -sanitization goes even further: the low-level implementation may -return the requested information in $CDROM_MSF$ format if it wishes so -(setting the $ms_info\rightarrow addr_format$ field appropriately, of -course) and the routines in \cdromc\ will make the transformation if -necessary. The return value is 0 upon success. - -\subsection{$Int\ get_mcn(struct\ cdrom_device_info * cdi, struct\ - cdrom_mcn * mcn)$} - -Some discs carry a `Media Catalog Number' (MCN), also called -`Universal Product Code' (UPC). This number should reflect the number -that is generally found in the bar-code on the product. Unfortunately, -the few discs that carry such a number on the disc don't even use the -same format. The return argument to this function is a pointer to a -pre-declared memory region of type $struct\ cdrom_mcn$. The MCN is -expected as a 13-character string, terminated by a null-character. - -\subsection{$Int\ reset(struct\ cdrom_device_info * cdi)$} - -This call should perform a hard-reset on the drive (although in -circumstances that a hard-reset is necessary, a drive may very well not -listen to commands anymore). Preferably, control is returned to the -caller only after the drive has finished resetting. If the drive is no -longer listening, it may be wise for the underlying low-level cdrom -driver to time out. - -\subsection{$Int\ audio_ioctl(struct\ cdrom_device_info * cdi, unsigned\ - int\ cmd, void * arg)$} - -Some of the \cdrom-$ioctl$s defined in \cdromh\ can be -implemented by the routines described above, and hence the function -$cdrom_ioctl$ will use those. However, most $ioctl$s deal with -audio-control. We have decided to leave these to be accessed through a -single function, repeating the arguments $cmd$ and $arg$. Note that -the latter is of type $void*{}$, rather than $unsigned\ long\ -int$. The routine $cdrom_ioctl()$ does do some useful things, -though. It sanitizes the address format type to $CDROM_MSF$ (Minutes, -Seconds, Frames) for all audio calls. It also verifies the memory -location of $arg$, and reserves stack-memory for the argument. This -makes implementation of the $audio_ioctl()$ much simpler than in the -old driver scheme. For example, you may look up the function -$cm206_audio_ioctl()$ in {\tt {cm206.c}} that should be updated with -this documentation. - -An unimplemented ioctl should return $-ENOSYS$, but a harmless request -(\eg, $CDROMSTART$) may be ignored by returning 0 (success). Other -errors should be according to the standards, whatever they are. When -an error is returned by the low-level driver, the \UCD\ tries whenever -possible to return the error code to the calling program. (We may decide -to sanitize the return value in $cdrom_ioctl()$ though, in order to -guarantee a uniform interface to the audio-player software.) - -\subsection{$Int\ dev_ioctl(struct\ cdrom_device_info * cdi, unsigned\ int\ - cmd, unsigned\ long\ arg)$} - -Some $ioctl$s seem to be specific to certain \cdrom\ drives. That is, -they are introduced to service some capabilities of certain drives. In -fact, there are 6 different $ioctl$s for reading data, either in some -particular kind of format, or audio data. Not many drives support -reading audio tracks as data, I believe this is because of protection -of copyrights of artists. Moreover, I think that if audio-tracks are -supported, it should be done through the VFS and not via $ioctl$s. A -problem here could be the fact that audio-frames are 2352 bytes long, -so either the audio-file-system should ask for 75264 bytes at once -(the least common multiple of 512 and 2352), or the drivers should -bend their backs to cope with this incoherence (to which I would be -opposed). Furthermore, it is very difficult for the hardware to find -the exact frame boundaries, since there are no synchronization headers -in audio frames. Once these issues are resolved, this code should be -standardized in \cdromc. - -Because there are so many $ioctl$s that seem to be introduced to -satisfy certain drivers,\footnote{Is there software around that - actually uses these? I'd be interested!} any `non-standard' $ioctl$s -are routed through the call $dev_ioctl()$. In principle, `private' -$ioctl$s should be numbered after the device's major number, and not -the general \cdrom\ $ioctl$ number, {\tt {0x53}}. Currently the -non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2, - CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK, - CDROMPLAY\-BLK and CDROM\-READALL}. - - -\subsection{\cdrom\ capabilities} -\label{capability} - -Instead of just implementing some $ioctl$ calls, the interface in -\cdromc\ supplies the possibility to indicate the {\em capabilities\/} -of a \cdrom\ drive. This can be done by ORing any number of -capability-constants that are defined in \cdromh\ at the registration -phase. Currently, the capabilities are any of: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDC_CLOSE_TRAY& can close tray by software control\cr -CDC_OPEN_TRAY& can open tray\cr -CDC_LOCK& can lock and unlock the door\cr -CDC_SELECT_SPEED& can select speed, in units of $\sim$150\,kB/s\cr -CDC_SELECT_DISC& drive is juke-box\cr -CDC_MULTI_SESSION& can read sessions $>\rm1$\cr -CDC_MCN& can read Media Catalog Number\cr -CDC_MEDIA_CHANGED& can report if disc has changed\cr -CDC_PLAY_AUDIO& can perform audio-functions (play, pause, etc)\cr -CDC_RESET& hard reset device\cr -CDC_IOCTLS& driver has non-standard ioctls\cr -CDC_DRIVE_STATUS& driver implements drive status\cr -} -$$ -The capability flag is declared $const$, to prevent drivers from -accidentally tampering with the contents. The capability fags actually -inform \cdromc\ of what the driver can do. If the drive found -by the driver does not have the capability, is can be masked out by -the $cdrom_device_info$ variable $mask$. For instance, the SCSI \cdrom\ -driver has implemented the code for loading and ejecting \cdrom's, and -hence its corresponding flags in $capability$ will be set. But a SCSI -\cdrom\ drive might be a caddy system, which can't load the tray, and -hence for this drive the $cdrom_device_info$ struct will have set -the $CDC_CLOSE_TRAY$ bit in $mask$. - -In the file \cdromc\ you will encounter many constructions of the type -$$\it -if\ (cdo\rightarrow capability \mathrel\& \mathord{\sim} cdi\rightarrow mask - \mathrel{\&} CDC_) \ldots -$$ -There is no $ioctl$ to set the mask\dots The reason is that -I think it is better to control the {\em behavior\/} rather than the -{\em capabilities}. - -\subsection{Options} - -A final flag register controls the {\em behavior\/} of the \cdrom\ -drives, in order to satisfy different users' wishes, hopefully -independently of the ideas of the respective author who happened to -have made the drive's support available to the \linux\ community. The -current behavior options are: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDO_AUTO_CLOSE& try to close tray upon device $open()$\cr -CDO_AUTO_EJECT& try to open tray on last device $close()$\cr -CDO_USE_FFLAGS& use $file_pointer\rightarrow f_flags$ to indicate - purpose for $open()$\cr -CDO_LOCK& try to lock door if device is opened\cr -CDO_CHECK_TYPE& ensure disc type is data if opened for data\cr -} -$$ - -The initial value of this register is $CDO_AUTO_CLOSE \mathrel| -CDO_USE_FFLAGS \mathrel| CDO_LOCK$, reflecting my own view on user -interface and software standards. Before you protest, there are two -new $ioctl$s implemented in \cdromc, that allow you to control the -behavior by software. These are: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDROM_SET_OPTIONS& set options specified in $(int)\ arg$\cr -CDROM_CLEAR_OPTIONS& clear options specified in $(int)\ arg$\cr -} -$$ -One option needs some more explanation: $CDO_USE_FFLAGS$. In the next -newsection we explain what the need for this option is. - -A software package {\tt setcd}, available from the Debian distribution -and {\tt sunsite.unc.edu}, allows user level control of these flags. - -\newsection{The need to know the purpose of opening the \cdrom\ device} - -Traditionally, Unix devices can be used in two different `modes', -either by reading/writing to the device file, or by issuing -controlling commands to the device, by the device's $ioctl()$ -call. The problem with \cdrom\ drives, is that they can be used for -two entirely different purposes. One is to mount removable -file systems, \cdrom s, the other is to play audio CD's. Audio commands -are implemented entirely through $ioctl$s, presumably because the -first implementation (SUN?) has been such. In principle there is -nothing wrong with this, but a good control of the `CD player' demands -that the device can {\em always\/} be opened in order to give the -$ioctl$ commands, regardless of the state the drive is in. - -On the other hand, when used as a removable-media disc drive (what the -original purpose of \cdrom s is) we would like to make sure that the -disc drive is ready for operation upon opening the device. In the old -scheme, some \cdrom\ drivers don't do any integrity checking, resulting -in a number of i/o errors reported by the VFS to the kernel when an -attempt for mounting a \cdrom\ on an empty drive occurs. This is not a -particularly elegant way to find out that there is no \cdrom\ inserted; -it more-or-less looks like the old IBM-PC trying to read an empty floppy -drive for a couple of seconds, after which the system complains it -can't read from it. Nowadays we can {\em sense\/} the existence of a -removable medium in a drive, and we believe we should exploit that -fact. An integrity check on opening of the device, that verifies the -availability of a \cdrom\ and its correct type (data), would be -desirable. - -These two ways of using a \cdrom\ drive, principally for data and -secondarily for playing audio discs, have different demands for the -behavior of the $open()$ call. Audio use simply wants to open the -device in order to get a file handle which is needed for issuing -$ioctl$ commands, while data use wants to open for correct and -reliable data transfer. The only way user programs can indicate what -their {\em purpose\/} of opening the device is, is through the $flags$ -parameter (see {\tt {open(2)}}). For \cdrom\ devices, these flags aren't -implemented (some drivers implement checking for write-related flags, -but this is not strictly necessary if the device file has correct -permission flags). Most option flags simply don't make sense to -\cdrom\ devices: $O_CREAT$, $O_NOCTTY$, $O_TRUNC$, $O_APPEND$, and -$O_SYNC$ have no meaning to a \cdrom. - -We therefore propose to use the flag $O_NONBLOCK$ to indicate -that the device is opened just for issuing $ioctl$ -commands. Strictly, the meaning of $O_NONBLOCK$ is that opening and -subsequent calls to the device don't cause the calling process to -wait. We could interpret this as ``don't wait until someone has -inserted some valid data-\cdrom.'' Thus, our proposal of the -implementation for the $open()$ call for \cdrom s is: -\begin{itemize} -\item If no other flags are set than $O_RDONLY$, the device is opened -for data transfer, and the return value will be 0 only upon successful -initialization of the transfer. The call may even induce some actions -on the \cdrom, such as closing the tray. -\item If the option flag $O_NONBLOCK$ is set, opening will always be -successful, unless the whole device doesn't exist. The drive will take -no actions whatsoever. -\end{itemize} - -\subsection{And what about standards?} - -You might hesitate to accept this proposal as it comes from the -\linux\ community, and not from some standardizing institute. What -about SUN, SGI, HP and all those other Unix and hardware vendors? -Well, these companies are in the lucky position that they generally -control both the hardware and software of their supported products, -and are large enough to set their own standard. They do not have to -deal with a dozen or more different, competing hardware -configurations.\footnote{Incidentally, I think that SUN's approach to -mounting \cdrom s is very good in origin: under Solaris a -volume-daemon automatically mounts a newly inserted \cdrom\ under {\tt -{/cdrom/$$/}}. In my opinion they should have pushed this -further and have {\em every\/} \cdrom\ on the local area network be -mounted at the similar location, \ie, no matter in which particular -machine you insert a \cdrom, it will always appear at the same -position in the directory tree, on every system. When I wanted to -implement such a user-program for \linux, I came across the -differences in behavior of the various drivers, and the need for an -$ioctl$ informing about media changes.} - -We believe that using $O_NONBLOCK$ to indicate that a device is being opened -for $ioctl$ commands only can be easily introduced in the \linux\ -community. All the CD-player authors will have to be informed, we can -even send in our own patches to the programs. The use of $O_NONBLOCK$ -has most likely no influence on the behavior of the CD-players on -other operating systems than \linux. Finally, a user can always revert -to old behavior by a call to $ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, -CDO_USE_FFLAGS)$. - -\subsection{The preferred strategy of $open()$} - -The routines in \cdromc\ are designed in such a way that run-time -configuration of the behavior of \cdrom\ devices (of {\em any\/} type) -can be carried out, by the $CDROM_SET/CLEAR_OPTIONS$ $ioctls$. Thus, various -modes of operation can be set: -\begin{description} -\item[$CDO_AUTO_CLOSE \mathrel| CDO_USE_FFLAGS \mathrel| CDO_LOCK$] This -is the default setting. (With $CDO_CHECK_TYPE$ it will be better, in the -future.) If the device is not yet opened by any other process, and if -the device is being opened for data ($O_NONBLOCK$ is not set) and the -tray is found to be open, an attempt to close the tray is made. Then, -it is verified that a disc is in the drive and, if $CDO_CHECK_TYPE$ is -set, that it contains tracks of type `data mode 1.' Only if all tests -are passed is the return value zero. The door is locked to prevent file -system corruption. If the drive is opened for audio ($O_NONBLOCK$ is -set), no actions are taken and a value of 0 will be returned. -\item[$CDO_AUTO_CLOSE \mathrel| CDO_AUTO_EJECT \mathrel| CDO_LOCK$] This -mimics the behavior of the current sbpcd-driver. The option flags are -ignored, the tray is closed on the first open, if necessary. Similarly, -the tray is opened on the last release, \ie, if a \cdrom\ is unmounted, -it is automatically ejected, such that the user can replace it. -\end{description} -We hope that these option can convince everybody (both driver -maintainers and user program developers) to adopt the new \cdrom\ -driver scheme and option flag interpretation. - -\newsection{Description of routines in \cdromc} - -Only a few routines in \cdromc\ are exported to the drivers. In this -new section we will discuss these, as well as the functions that `take -over' the \cdrom\ interface to the kernel. The header file belonging -to \cdromc\ is called \cdromh. Formerly, some of the contents of this -file were placed in the file {\tt {ucdrom.h}}, but this file has now been -merged back into \cdromh. - -\subsection{$Struct\ file_operations\ cdrom_fops$} - -The contents of this structure were described in section~\ref{cdrom.c}. -A pointer to this structure is assigned to the $fops$ field -of the $struct gendisk$. - -\subsection{$Int\ register_cdrom( struct\ cdrom_device_info\ * cdi)$} - -This function is used in about the same way one registers $cdrom_fops$ -with the kernel, the device operations and information structures, -as described in section~\ref{cdrom.c}, should be registered with the -\UCD: -$$ -register_cdrom(\&_info)); -$$ -This function returns zero upon success, and non-zero upon -failure. The structure $_info$ should have a pointer to the -driver's $_dops$, as in -$$ -\vbox{\halign{&$#$\hfil\cr -struct\ &cdrom_device_info\ _info = \{\cr -& _dops;\cr -&\ldots\cr -\}\cr -}}$$ -Note that a driver must have one static structure, $_dops$, while -it may have as many structures $_info$ as there are minor devices -active. $Register_cdrom()$ builds a linked list from these. - -\subsection{$Void\ unregister_cdrom(struct\ cdrom_device_info * cdi)$} - -Unregistering device $cdi$ with minor number $MINOR(cdi\to dev)$ removes -the minor device from the list. If it was the last registered minor for -the low-level driver, this disconnects the registered device-operation -routines from the \cdrom\ interface. This function returns zero upon -success, and non-zero upon failure. - -\subsection{$Int\ cdrom_open(struct\ inode * ip, struct\ file * fp)$} - -This function is not called directly by the low-level drivers, it is -listed in the standard $cdrom_fops$. If the VFS opens a file, this -function becomes active. A strategy is implemented in this routine, -taking care of all capabilities and options that are set in the -$cdrom_device_ops$ connected to the device. Then, the program flow is -transferred to the device_dependent $open()$ call. - -\subsection{$Void\ cdrom_release(struct\ inode *ip, struct\ file -*fp)$} - -This function implements the reverse-logic of $cdrom_open()$, and then -calls the device-dependent $release()$ routine. When the use-count has -reached 0, the allocated buffers are flushed by calls to $sync_dev(dev)$ -and $invalidate_buffers(dev)$. - - -\subsection{$Int\ cdrom_ioctl(struct\ inode *ip, struct\ file *fp, -unsigned\ int\ cmd, unsigned\ long\ arg)$} -\label{cdrom-ioctl} - -This function handles all the standard $ioctl$ requests for \cdrom\ -devices in a uniform way. The different calls fall into three -categories: $ioctl$s that can be directly implemented by device -operations, ones that are routed through the call $audio_ioctl()$, and -the remaining ones, that are presumable device-dependent. Generally, a -negative return value indicates an error. - -\subsubsection{Directly implemented $ioctl$s} -\label{ioctl-direct} - -The following `old' \cdrom-$ioctl$s are implemented by directly -calling device-operations in $cdrom_device_ops$, if implemented and -not masked: -\begin{description} -\item[CDROMMULTISESSION] Requests the last session on a \cdrom. -\item[CDROMEJECT] Open tray. -\item[CDROMCLOSETRAY] Close tray. -\item[CDROMEJECT_SW] If $arg\not=0$, set behavior to auto-close (close -tray on first open) and auto-eject (eject on last release), otherwise -set behavior to non-moving on $open()$ and $release()$ calls. -\item[CDROM_GET_MCN] Get the Media Catalog Number from a CD. -\end{description} - -\subsubsection{$Ioctl$s routed through $audio_ioctl()$} -\label{ioctl-audio} - -The following set of $ioctl$s are all implemented through a call to -the $cdrom_fops$ function $audio_ioctl()$. Memory checks and -allocation are performed in $cdrom_ioctl()$, and also sanitization of -address format ($CDROM_LBA$/$CDROM_MSF$) is done. -\begin{description} -\item[CDROMSUBCHNL] Get sub-channel data in argument $arg$ of type $struct\ -cdrom_subchnl *{}$. -\item[CDROMREADTOCHDR] Read Table of Contents header, in $arg$ of type -$struct\ cdrom_tochdr *{}$. -\item[CDROMREADTOCENTRY] Read a Table of Contents entry in $arg$ and -specified by $arg$ of type $struct\ cdrom_tocentry *{}$. -\item[CDROMPLAYMSF] Play audio fragment specified in Minute, Second, -Frame format, delimited by $arg$ of type $struct\ cdrom_msf *{}$. -\item[CDROMPLAYTRKIND] Play audio fragment in track-index format -delimited by $arg$ of type $struct\ \penalty-1000 cdrom_ti *{}$. -\item[CDROMVOLCTRL] Set volume specified by $arg$ of type $struct\ -cdrom_volctrl *{}$. -\item[CDROMVOLREAD] Read volume into by $arg$ of type $struct\ -cdrom_volctrl *{}$. -\item[CDROMSTART] Spin up disc. -\item[CDROMSTOP] Stop playback of audio fragment. -\item[CDROMPAUSE] Pause playback of audio fragment. -\item[CDROMRESUME] Resume playing. -\end{description} - -\subsubsection{New $ioctl$s in \cdromc} - -The following $ioctl$s have been introduced to allow user programs to -control the behavior of individual \cdrom\ devices. New $ioctl$ -commands can be identified by the underscores in their names. -\begin{description} -\item[CDROM_SET_OPTIONS] Set options specified by $arg$. Returns the -option flag register after modification. Use $arg = \rm0$ for reading -the current flags. -\item[CDROM_CLEAR_OPTIONS] Clear options specified by $arg$. Returns - the option flag register after modification. -\item[CDROM_SELECT_SPEED] Select head-rate speed of disc specified as - by $arg$ in units of standard cdrom speed (176\,kB/sec raw data or - 150\,kB/sec file system data). The value 0 means `auto-select', \ie, - play audio discs at real time and data discs at maximum speed. The value - $arg$ is checked against the maximum head rate of the drive found in the - $cdrom_dops$. -\item[CDROM_SELECT_DISC] Select disc numbered $arg$ from a juke-box. - First disc is numbered 0. The number $arg$ is checked against the - maximum number of discs in the juke-box found in the $cdrom_dops$. -\item[CDROM_MEDIA_CHANGED] Returns 1 if a disc has been changed since - the last call. Note that calls to $cdrom_media_changed$ by the VFS - are treated by an independent queue, so both mechanisms will detect - a media change once. For juke-boxes, an extra argument $arg$ - specifies the slot for which the information is given. The special - value $CDSL_CURRENT$ requests that information about the currently - selected slot be returned. -\item[CDROM_DRIVE_STATUS] Returns the status of the drive by a call to - $drive_status()$. Return values are defined in section~\ref{drive - status}. Note that this call doesn't return information on the - current playing activity of the drive; this can be polled through an - $ioctl$ call to $CDROMSUBCHNL$. For juke-boxes, an extra argument - $arg$ specifies the slot for which (possibly limited) information is - given. The special value $CDSL_CURRENT$ requests that information - about the currently selected slot be returned. -\item[CDROM_DISC_STATUS] Returns the type of the disc currently in the - drive. It should be viewed as a complement to $CDROM_DRIVE_STATUS$. - This $ioctl$ can provide \emph {some} information about the current - disc that is inserted in the drive. This functionality used to be - implemented in the low level drivers, but is now carried out - entirely in \UCD. - - The history of development of the CD's use as a carrier medium for - various digital information has lead to many different disc types. - This $ioctl$ is useful only in the case that CDs have \emph {only - one} type of data on them. While this is often the case, it is - also very common for CDs to have some tracks with data, and some - tracks with audio. Because this is an existing interface, rather - than fixing this interface by changing the assumptions it was made - under, thereby breaking all user applications that use this - function, the \UCD\ implements this $ioctl$ as follows: If the CD in - question has audio tracks on it, and it has absolutely no CD-I, XA, - or data tracks on it, it will be reported as $CDS_AUDIO$. If it has - both audio and data tracks, it will return $CDS_MIXED$. If there - are no audio tracks on the disc, and if the CD in question has any - CD-I tracks on it, it will be reported as $CDS_XA_2_2$. Failing - that, if the CD in question has any XA tracks on it, it will be - reported as $CDS_XA_2_1$. Finally, if the CD in question has any - data tracks on it, it will be reported as a data CD ($CDS_DATA_1$). - - This $ioctl$ can return: - $$ - \halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr - CDS_NO_INFO& no information available\cr - CDS_NO_DISC& no disc is inserted, or tray is opened\cr - CDS_AUDIO& Audio disc (2352 audio bytes/frame)\cr - CDS_DATA_1& data disc, mode 1 (2048 user bytes/frame)\cr - CDS_XA_2_1& mixed data (XA), mode 2, form 1 (2048 user bytes)\cr - CDS_XA_2_2& mixed data (XA), mode 2, form 1 (2324 user bytes)\cr - CDS_MIXED& mixed audio/data disc\cr - } - $$ - For some information concerning frame layout of the various disc - types, see a recent version of \cdromh. - -\item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a - juke-box. -\item[CDROMRESET] Reset the drive. -\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the - drive. Refer to section \ref{capability} for more information on - these flags. -\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$ - unlocks the door, any other value locks it. -\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed - to do this. Same semantics as CDROM_LOCKDOOR. -\end{description} - -\subsubsection{Device dependent $ioctl$s} - -Finally, all other $ioctl$s are passed to the function $dev_ioctl()$, -if implemented. No memory allocation or verification is carried out. - -\newsection{How to update your driver} - -\begin{enumerate} -\item Make a backup of your current driver. -\item Get hold of the files \cdromc\ and \cdromh, they should be in - the directory tree that came with this documentation. -\item Make sure you include \cdromh. -\item Change the 3rd argument of $register_blkdev$ from -$\&_fops$ to $\&cdrom_fops$. -\item Just after that line, add the following to register with the \UCD: - $$register_cdrom(\&_info);$$ - Similarly, add a call to $unregister_cdrom()$ at the appropriate place. -\item Copy an example of the device-operations $struct$ to your - source, \eg, from {\tt {cm206.c}} $cm206_dops$, and change all - entries to names corresponding to your driver, or names you just - happen to like. If your driver doesn't support a certain function, - make the entry $NULL$. At the entry $capability$ you should list all - capabilities your driver currently supports. If your driver - has a capability that is not listed, please send me a message. -\item Copy the $cdrom_device_info$ declaration from the same example - driver, and modify the entries according to your needs. If your - driver dynamically determines the capabilities of the hardware, this - structure should also be declared dynamically. -\item Implement all functions in your $_dops$ structure, - according to prototypes listed in \cdromh, and specifications given - in section~\ref{cdrom.c}. Most likely you have already implemented - the code in a large part, and you will almost certainly need to adapt the - prototype and return values. -\item Rename your $_ioctl()$ function to $audio_ioctl$ and - change the prototype a little. Remove entries listed in the first - part in section~\ref{cdrom-ioctl}, if your code was OK, these are - just calls to the routines you adapted in the previous step. -\item You may remove all remaining memory checking code in the - $audio_ioctl()$ function that deals with audio commands (these are - listed in the second part of section~\ref{cdrom-ioctl}). There is no - need for memory allocation either, so most $case$s in the $switch$ - statement look similar to: - $$ - case\ CDROMREADTOCENTRY\colon get_toc_entry\bigl((struct\ - cdrom_tocentry *{})\ arg\bigr); - $$ -\item All remaining $ioctl$ cases must be moved to a separate - function, $_ioctl$, the device-dependent $ioctl$s. Note that - memory checking and allocation must be kept in this code! -\item Change the prototypes of $_open()$ and - $_release()$, and remove any strategic code (\ie, tray - movement, door locking, etc.). -\item Try to recompile the drivers. We advise you to use modules, both - for {\tt {cdrom.o}} and your driver, as debugging is much easier this - way. -\end{enumerate} - -\newsection{Thanks} - -Thanks to all the people involved. First, Erik Andersen, who has -taken over the torch in maintaining \cdromc\ and integrating much -\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and -Gerd Knorr, who were the first to implement this interface for SCSI -and IDE-CD drivers and added many ideas for extension of the data -structures relative to kernel~2.0. Further thanks to Heiko Ei{\ss}feldt, -Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew -Kroll, the \linux\ \cdrom\ device driver developers who were kind -enough to give suggestions and criticisms during the writing. Finally -of course, I want to thank Linus Torvalds for making this possible in -the first place. - -\vfill -$ \version\ $ -\eject -\end{document} diff --git a/Documentation/cdrom/cdrom-standard.txt b/Documentation/cdrom/cdrom-standard.txt new file mode 100644 index 000000000000..dde4f7f7fdbf --- /dev/null +++ b/Documentation/cdrom/cdrom-standard.txt @@ -0,0 +1,1063 @@ +======================= +A Linux CD-ROM standard +======================= + +:Author: David van Leeuwen +:Date: 12 March 1999 +:Updated by: Erik Andersen (andersee@debian.org) +:Updated by: Jens Axboe (axboe@image.dk) + + +Introduction +============ + +Linux is probably the Unix-like operating system that supports +the widest variety of hardware devices. The reasons for this are +presumably + +- The large list of hardware devices available for the many platforms + that Linux now supports (i.e., i386-PCs, Sparc Suns, etc.) +- The open design of the operating system, such that anybody can write a + driver for Linux. +- There is plenty of source code around as examples of how to write a driver. + +The openness of Linux, and the many different types of available +hardware has allowed Linux to support many different hardware devices. +Unfortunately, the very openness that has allowed Linux to support +all these different devices has also allowed the behavior of each +device driver to differ significantly from one device to another. +This divergence of behavior has been very significant for CD-ROM +devices; the way a particular drive reacts to a `standard` *ioctl()* +call varies greatly from one device driver to another. To avoid making +their drivers totally inconsistent, the writers of Linux CD-ROM +drivers generally created new device drivers by understanding, copying, +and then changing an existing one. Unfortunately, this practice did not +maintain uniform behavior across all the Linux CD-ROM drivers. + +This document describes an effort to establish Uniform behavior across +all the different CD-ROM device drivers for Linux. This document also +defines the various *ioctl()'s*, and how the low-level CD-ROM device +drivers should implement them. Currently (as of the Linux 2.1.\ *x* +development kernels) several low-level CD-ROM device drivers, including +both IDE/ATAPI and SCSI, now use this Uniform interface. + +When the CD-ROM was developed, the interface between the CD-ROM drive +and the computer was not specified in the standards. As a result, many +different CD-ROM interfaces were developed. Some of them had their +own proprietary design (Sony, Mitsumi, Panasonic, Philips), other +manufacturers adopted an existing electrical interface and changed +the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply +adapted their drives to one or more of the already existing electrical +interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and +most of the `NoName` manufacturers). In cases where a new drive really +brought its own interface or used its own command set and flow control +scheme, either a separate driver had to be written, or an existing +driver had to be enhanced. History has delivered us CD-ROM support for +many of these different interfaces. Nowadays, almost all new CD-ROM +drives are either IDE/ATAPI or SCSI, and it is very unlikely that any +manufacturer will create a new interface. Even finding drives for the +old proprietary interfaces is getting difficult. + +When (in the 1.3.70's) I looked at the existing software interface, +which was expressed through `cdrom.h`, it appeared to be a rather wild +set of commands and data formats [#f1]_. It seemed that many +features of the software interface had been added to accommodate the +capabilities of a particular drive, in an *ad hoc* manner. More +importantly, it appeared that the behavior of the `standard` commands +was different for most of the different drivers: e. g., some drivers +close the tray if an *open()* call occurs when the tray is open, while +others do not. Some drivers lock the door upon opening the device, to +prevent an incoherent file system, but others don't, to allow software +ejection. Undoubtedly, the capabilities of the different drives vary, +but even when two drives have the same capability their drivers' +behavior was usually different. + +.. [#f1] + I cannot recollect what kernel version I looked at, then, + presumably 1.2.13 and 1.3.34 --- the latest kernel that I was + indirectly involved in. + +I decided to start a discussion on how to make all the Linux CD-ROM +drivers behave more uniformly. I began by contacting the developers of +the many CD-ROM drivers found in the Linux kernel. Their reactions +encouraged me to write the Uniform CD-ROM Driver which this document is +intended to describe. The implementation of the Uniform CD-ROM Driver is +in the file `cdrom.c`. This driver is intended to be an additional software +layer that sits on top of the low-level device drivers for each CD-ROM drive. +By adding this additional layer, it is possible to have all the different +CD-ROM devices behave **exactly** the same (insofar as the underlying +hardware will allow). + +The goal of the Uniform CD-ROM Driver is **not** to alienate driver developers +whohave not yet taken steps to support this effort. The goal of Uniform CD-ROM +Driver is simply to give people writing application programs for CD-ROM drives +**one** Linux CD-ROM interface with consistent behavior for all +CD-ROM devices. In addition, this also provides a consistent interface +between the low-level device driver code and the Linux kernel. Care +is taken that 100% compatibility exists with the data structures and +programmer's interface defined in `cdrom.h`. This guide was written to +help CD-ROM driver developers adapt their code to use the Uniform CD-ROM +Driver code defined in `cdrom.c`. + +Personally, I think that the most important hardware interfaces are +the IDE/ATAPI drives and, of course, the SCSI drives, but as prices +of hardware drop continuously, it is also likely that people may have +more than one CD-ROM drive, possibly of mixed types. It is important +that these drives behave in the same way. In December 1994, one of the +cheapest CD-ROM drives was a Philips cm206, a double-speed proprietary +drive. In the months that I was busy writing a Linux driver for it, +proprietary drives became obsolete and IDE/ATAPI drives became the +standard. At the time of the last update to this document (November +1997) it is becoming difficult to even **find** anything less than a +16 speed CD-ROM drive, and 24 speed drives are common. + +.. _cdrom_api: + +Standardizing through another software level +============================================ + +At the time this document was conceived, all drivers directly +implemented the CD-ROM *ioctl()* calls through their own routines. This +led to the danger of different drivers forgetting to do important things +like checking that the user was giving the driver valid data. More +importantly, this led to the divergence of behavior, which has already +been discussed. + +For this reason, the Uniform CD-ROM Driver was created to enforce consistent +CD-ROM drive behavior, and to provide a common set of services to the various +low-level CD-ROM device drivers. The Uniform CD-ROM Driver now provides another +software-level, that separates the *ioctl()* and *open()* implementation +from the actual hardware implementation. Note that this effort has +made few changes which will affect a user's application programs. The +greatest change involved moving the contents of the various low-level +CD-ROM drivers\' header files to the kernel's cdrom directory. This was +done to help ensure that the user is only presented with only one cdrom +interface, the interface defined in `cdrom.h`. + +CD-ROM drives are specific enough (i. e., different from other +block-devices such as floppy or hard disc drives), to define a set +of common **CD-ROM device operations**, *_dops*. +These operations are different from the classical block-device file +operations, *_fops*. + +The routines for the Uniform CD-ROM Driver interface level are implemented +in the file `cdrom.c`. In this file, the Uniform CD-ROM Driver interfaces +with the kernel as a block device by registering the following general +*struct file_operations*:: + + struct file_operations cdrom_fops = { + NULL, /∗ lseek ∗/ + block _read , /∗ read—general block-dev read ∗/ + block _write, /∗ write—general block-dev write ∗/ + NULL, /∗ readdir ∗/ + NULL, /∗ select ∗/ + cdrom_ioctl, /∗ ioctl ∗/ + NULL, /∗ mmap ∗/ + cdrom_open, /∗ open ∗/ + cdrom_release, /∗ release ∗/ + NULL, /∗ fsync ∗/ + NULL, /∗ fasync ∗/ + cdrom_media_changed, /∗ media change ∗/ + NULL /∗ revalidate ∗/ + }; + +Every active CD-ROM device shares this *struct*. The routines +declared above are all implemented in `cdrom.c`, since this file is the +place where the behavior of all CD-ROM-devices is defined and +standardized. The actual interface to the various types of CD-ROM +hardware is still performed by various low-level CD-ROM-device +drivers. These routines simply implement certain **capabilities** +that are common to all CD-ROM (and really, all removable-media +devices). + +Registration of a low-level CD-ROM device driver is now done through +the general routines in `cdrom.c`, not through the Virtual File System +(VFS) any more. The interface implemented in `cdrom.c` is carried out +through two general structures that contain information about the +capabilities of the driver, and the specific drives on which the +driver operates. The structures are: + +cdrom_device_ops + This structure contains information about the low-level driver for a + CD-ROM device. This structure is conceptually connected to the major + number of the device (although some drivers may have different + major numbers, as is the case for the IDE driver). + +cdrom_device_info + This structure contains information about a particular CD-ROM drive, + such as its device name, speed, etc. This structure is conceptually + connected to the minor number of the device. + +Registering a particular CD-ROM drive with the Uniform CD-ROM Driver +is done by the low-level device driver though a call to:: + + register_cdrom(struct cdrom_device_info * _info) + +The device information structure, *_info*, contains all the +information needed for the kernel to interface with the low-level +CD-ROM device driver. One of the most important entries in this +structure is a pointer to the *cdrom_device_ops* structure of the +low-level driver. + +The device operations structure, *cdrom_device_ops*, contains a list +of pointers to the functions which are implemented in the low-level +device driver. When `cdrom.c` accesses a CD-ROM device, it does it +through the functions in this structure. It is impossible to know all +the capabilities of future CD-ROM drives, so it is expected that this +list may need to be expanded from time to time as new technologies are +developed. For example, CD-R and CD-R/W drives are beginning to become +popular, and support will soon need to be added for them. For now, the +current *struct* is:: + + struct cdrom_device_ops { + int (*open)(struct cdrom_device_info *, int) + void (*release)(struct cdrom_device_info *); + int (*drive_status)(struct cdrom_device_info *, int); + unsigned int (*check_events)(struct cdrom_device_info *, + unsigned int, int); + int (*media_changed)(struct cdrom_device_info *, int); + int (*tray_move)(struct cdrom_device_info *, int); + int (*lock_door)(struct cdrom_device_info *, int); + int (*select_speed)(struct cdrom_device_info *, int); + int (*select_disc)(struct cdrom_device_info *, int); + int (*get_last_session) (struct cdrom_device_info *, + struct cdrom_multisession *); + int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); + int (*reset)(struct cdrom_device_info *); + int (*audio_ioctl)(struct cdrom_device_info *, + unsigned int, void *); + const int capability; /* capability flags */ + int (*generic_packet)(struct cdrom_device_info *, + struct packet_command *); + }; + +When a low-level device driver implements one of these capabilities, +it should add a function pointer to this *struct*. When a particular +function is not implemented, however, this *struct* should contain a +NULL instead. The *capability* flags specify the capabilities of the +CD-ROM hardware and/or low-level CD-ROM driver when a CD-ROM drive +is registered with the Uniform CD-ROM Driver. + +Note that most functions have fewer parameters than their +*blkdev_fops* counterparts. This is because very little of the +information in the structures *inode* and *file* is used. For most +drivers, the main parameter is the *struct* *cdrom_device_info*, from +which the major and minor number can be extracted. (Most low-level +CD-ROM drivers don't even look at the major and minor number though, +since many of them only support one device.) This will be available +through *dev* in *cdrom_device_info* described below. + +The drive-specific, minor-like information that is registered with +`cdrom.c`, currently contains the following fields:: + + struct cdrom_device_info { + const struct cdrom_device_ops * ops; /* device operations for this major */ + struct list_head list; /* linked list of all device_info */ + struct gendisk * disk; /* matching block layer disk */ + void * handle; /* driver-dependent data */ + + int mask; /* mask of capability: disables them */ + int speed; /* maximum speed for reading data */ + int capacity; /* number of discs in a jukebox */ + + unsigned int options:30; /* options flags */ + unsigned mc_flags:2; /* media-change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ + int use_count; /* number of times device is opened */ + char name[20]; /* name of the device type */ + + __u8 sanyo_slot : 2; /* Sanyo 3-CD changer support */ + __u8 keeplocked : 1; /* CDROM_LOCKDOOR status */ + __u8 reserved : 5; /* not used yet */ + int cdda_method; /* see CDDA_* flags */ + __u8 last_sense; /* saves last sense key */ + __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ + unsigned short mmc3_profile; /* current MMC3 profile */ + int for_data; /* unknown:TBD */ + int (*exit)(struct cdrom_device_info *);/* unknown:TBD */ + int mrw_mode_page; /* which MRW mode page is in use */ + }; + +Using this *struct*, a linked list of the registered minor devices is +built, using the *next* field. The device number, the device operations +struct and specifications of properties of the drive are stored in this +structure. + +The *mask* flags can be used to mask out some of the capabilities listed +in *ops->capability*, if a specific drive doesn't support a feature +of the driver. The value *speed* specifies the maximum head-rate of the +drive, measured in units of normal audio speed (176kB/sec raw data or +150kB/sec file system data). The parameters are declared *const* +because they describe properties of the drive, which don't change after +registration. + +A few registers contain variables local to the CD-ROM drive. The +flags *options* are used to specify how the general CD-ROM routines +should behave. These various flags registers should provide enough +flexibility to adapt to the different users' wishes (and **not** the +`arbitrary` wishes of the author of the low-level device driver, as is +the case in the old scheme). The register *mc_flags* is used to buffer +the information from *media_changed()* to two separate queues. Other +data that is specific to a minor drive, can be accessed through *handle*, +which can point to a data structure specific to the low-level driver. +The fields *use_count*, *next*, *options* and *mc_flags* need not be +initialized. + +The intermediate software layer that `cdrom.c` forms will perform some +additional bookkeeping. The use count of the device (the number of +processes that have the device opened) is registered in *use_count*. The +function *cdrom_ioctl()* will verify the appropriate user-memory regions +for read and write, and in case a location on the CD is transferred, +it will `sanitize` the format by making requests to the low-level +drivers in a standard format, and translating all formats between the +user-software and low level drivers. This relieves much of the drivers' +memory checking and format checking and translation. Also, the necessary +structures will be declared on the program stack. + +The implementation of the functions should be as defined in the +following sections. Two functions **must** be implemented, namely +*open()* and *release()*. Other functions may be omitted, their +corresponding capability flags will be cleared upon registration. +Generally, a function returns zero on success and negative on error. A +function call should return only after the command has completed, but of +course waiting for the device should not use processor time. + +:: + + int open(struct cdrom_device_info *cdi, int purpose) + +*Open()* should try to open the device for a specific *purpose*, which +can be either: + +- Open for reading data, as done by `mount()` (2), or the + user commands `dd` or `cat`. +- Open for *ioctl* commands, as done by audio-CD playing programs. + +Notice that any strategic code (closing tray upon *open()*, etc.) is +done by the calling routine in `cdrom.c`, so the low-level routine +should only be concerned with proper initialization, such as spinning +up the disc, etc. + +:: + + void release(struct cdrom_device_info *cdi) + +Device-specific actions should be taken such as spinning down the device. +However, strategic actions such as ejection of the tray, or unlocking +the door, should be left over to the general routine *cdrom_release()*. +This is the only function returning type *void*. + +.. _cdrom_drive_status: + +:: + + int drive_status(struct cdrom_device_info *cdi, int slot_nr) + +The function *drive_status*, if implemented, should provide +information on the status of the drive (not the status of the disc, +which may or may not be in the drive). If the drive is not a changer, +*slot_nr* should be ignored. In `cdrom.h` the possibilities are listed:: + + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, tray is closed */ + CDS_TRAY_OPEN /* tray is opened */ + CDS_DRIVE_NOT_READY /* something is wrong, tray is moving? */ + CDS_DISC_OK /* a disc is loaded and everything is fine */ + +:: + + int media_changed(struct cdrom_device_info *cdi, int disc_nr) + +This function is very similar to the original function in $struct +file_operations*. It returns 1 if the medium of the device *cdi->dev* +has changed since the last call, and 0 otherwise. The parameter +*disc_nr* identifies a specific slot in a juke-box, it should be +ignored for single-disc drives. Note that by `re-routing` this +function through *cdrom_media_changed()*, we can implement separate +queues for the VFS and a new *ioctl()* function that can report device +changes to software (e. g., an auto-mounting daemon). + +:: + + int tray_move(struct cdrom_device_info *cdi, int position) + +This function, if implemented, should control the tray movement. (No +other function should control this.) The parameter *position* controls +the desired direction of movement: + +- 0 Close tray +- 1 Open tray + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the tray is already in the desired position, no +action need be taken, and the return value should be 0. + +:: + + int lock_door(struct cdrom_device_info *cdi, int lock) + +This function (and no other code) controls locking of the door, if the +drive allows this. The value of *lock* controls the desired locking +state: + +- 0 Unlock door, manual opening is allowed +- 1 Lock door, tray cannot be ejected manually + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the door is already in the requested state, no +action need be taken, and the return value should be 0. + +:: + + int select_speed(struct cdrom_device_info *cdi, int speed) + +Some CD-ROM drives are capable of changing their head-speed. There +are several reasons for changing the speed of a CD-ROM drive. Badly +pressed CD-ROM s may benefit from less-than-maximum head rate. Modern +CD-ROM drives can obtain very high head rates (up to *24x* is +common). It has been reported that these drives can make reading +errors at these high speeds, reducing the speed can prevent data loss +in these circumstances. Finally, some of these drives can +make an annoyingly loud noise, which a lower speed may reduce. + +This function specifies the speed at which data is read or audio is +played back. The value of *speed* specifies the head-speed of the +drive, measured in units of standard cdrom speed (176kB/sec raw data +or 150kB/sec file system data). So to request that a CD-ROM drive +operate at 300kB/sec you would call the CDROM_SELECT_SPEED *ioctl* +with *speed=2*. The special value `0` means `auto-selection`, i. e., +maximum data-rate or real-time audio rate. If the drive doesn't have +this `auto-selection` capability, the decision should be made on the +current disc loaded and the return value should be positive. A negative +return value indicates an error. + +:: + + int select_disc(struct cdrom_device_info *cdi, int number) + +If the drive can store multiple discs (a juke-box) this function +will perform disc selection. It should return the number of the +selected disc on success, a negative value on error. Currently, only +the ide-cd driver supports this functionality. + +:: + + int get_last_session(struct cdrom_device_info *cdi, + struct cdrom_multisession *ms_info) + +This function should implement the old corresponding *ioctl()*. For +device *cdi->dev*, the start of the last session of the current disc +should be returned in the pointer argument *ms_info*. Note that +routines in `cdrom.c` have sanitized this argument: its requested +format will **always** be of the type *CDROM_LBA* (linear block +addressing mode), whatever the calling software requested. But +sanitization goes even further: the low-level implementation may +return the requested information in *CDROM_MSF* format if it wishes so +(setting the *ms_info->addr_format* field appropriately, of +course) and the routines in `cdrom.c` will make the transformation if +necessary. The return value is 0 upon success. + +:: + + int get_mcn(struct cdrom_device_info *cdi, + struct cdrom_mcn *mcn) + +Some discs carry a `Media Catalog Number` (MCN), also called +`Universal Product Code` (UPC). This number should reflect the number +that is generally found in the bar-code on the product. Unfortunately, +the few discs that carry such a number on the disc don't even use the +same format. The return argument to this function is a pointer to a +pre-declared memory region of type *struct cdrom_mcn*. The MCN is +expected as a 13-character string, terminated by a null-character. + +:: + + int reset(struct cdrom_device_info *cdi) + +This call should perform a hard-reset on the drive (although in +circumstances that a hard-reset is necessary, a drive may very well not +listen to commands anymore). Preferably, control is returned to the +caller only after the drive has finished resetting. If the drive is no +longer listening, it may be wise for the underlying low-level cdrom +driver to time out. + +:: + + int audio_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, void *arg) + +Some of the CD-ROM-\ *ioctl()*\ 's defined in `cdrom.h` can be +implemented by the routines described above, and hence the function +*cdrom_ioctl* will use those. However, most *ioctl()*\ 's deal with +audio-control. We have decided to leave these to be accessed through a +single function, repeating the arguments *cmd* and *arg*. Note that +the latter is of type *void*, rather than *unsigned long int*. +The routine *cdrom_ioctl()* does do some useful things, +though. It sanitizes the address format type to *CDROM_MSF* (Minutes, +Seconds, Frames) for all audio calls. It also verifies the memory +location of *arg*, and reserves stack-memory for the argument. This +makes implementation of the *audio_ioctl()* much simpler than in the +old driver scheme. For example, you may look up the function +*cm206_audio_ioctl()* `cm206.c` that should be updated with +this documentation. + +An unimplemented ioctl should return *-ENOSYS*, but a harmless request +(e. g., *CDROMSTART*) may be ignored by returning 0 (success). Other +errors should be according to the standards, whatever they are. When +an error is returned by the low-level driver, the Uniform CD-ROM Driver +tries whenever possible to return the error code to the calling program. +(We may decide to sanitize the return value in *cdrom_ioctl()* though, in +order to guarantee a uniform interface to the audio-player software.) + +:: + + int dev_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, unsigned long arg) + +Some *ioctl()'s* seem to be specific to certain CD-ROM drives. That is, +they are introduced to service some capabilities of certain drives. In +fact, there are 6 different *ioctl()'s* for reading data, either in some +particular kind of format, or audio data. Not many drives support +reading audio tracks as data, I believe this is because of protection +of copyrights of artists. Moreover, I think that if audio-tracks are +supported, it should be done through the VFS and not via *ioctl()'s*. A +problem here could be the fact that audio-frames are 2352 bytes long, +so either the audio-file-system should ask for 75264 bytes at once +(the least common multiple of 512 and 2352), or the drivers should +bend their backs to cope with this incoherence (to which I would be +opposed). Furthermore, it is very difficult for the hardware to find +the exact frame boundaries, since there are no synchronization headers +in audio frames. Once these issues are resolved, this code should be +standardized in `cdrom.c`. + +Because there are so many *ioctl()'s* that seem to be introduced to +satisfy certain drivers [#f2]_, any non-standard *ioctl()*\ s +are routed through the call *dev_ioctl()*. In principle, `private` +*ioctl()*\ 's should be numbered after the device's major number, and not +the general CD-ROM *ioctl* number, `0x53`. Currently the +non-supported *ioctl()'s* are: + + CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW, + CDROMREADCOOKED, CDROMSEEK, CDROMPLAY-BLK and CDROM-READALL + +.. [#f2] + + Is there software around that actually uses these? I'd be interested! + +.. _cdrom_capabilities: + +CD-ROM capabilities +------------------- + +Instead of just implementing some *ioctl* calls, the interface in +`cdrom.c` supplies the possibility to indicate the **capabilities** +of a CD-ROM drive. This can be done by ORing any number of +capability-constants that are defined in `cdrom.h` at the registration +phase. Currently, the capabilities are any of:: + + CDC_CLOSE_TRAY /* can close tray by software control */ + CDC_OPEN_TRAY /* can open tray */ + CDC_LOCK /* can lock and unlock the door */ + CDC_SELECT_SPEED /* can select speed, in units of * sim*150 ,kB/s */ + CDC_SELECT_DISC /* drive is juke-box */ + CDC_MULTI_SESSION /* can read sessions *> rm1* */ + CDC_MCN /* can read Media Catalog Number */ + CDC_MEDIA_CHANGED /* can report if disc has changed */ + CDC_PLAY_AUDIO /* can perform audio-functions (play, pause, etc) */ + CDC_RESET /* hard reset device */ + CDC_IOCTLS /* driver has non-standard ioctls */ + CDC_DRIVE_STATUS /* driver implements drive status */ + +The capability flag is declared *const*, to prevent drivers from +accidentally tampering with the contents. The capability fags actually +inform `cdrom.c` of what the driver can do. If the drive found +by the driver does not have the capability, is can be masked out by +the *cdrom_device_info* variable *mask*. For instance, the SCSI CD-ROM +driver has implemented the code for loading and ejecting CD-ROM's, and +hence its corresponding flags in *capability* will be set. But a SCSI +CD-ROM drive might be a caddy system, which can't load the tray, and +hence for this drive the *cdrom_device_info* struct will have set +the *CDC_CLOSE_TRAY* bit in *mask*. + +In the file `cdrom.c` you will encounter many constructions of the type:: + + if (cdo->capability & ∼cdi->mask & CDC _⟨capability⟩) ... + +There is no *ioctl* to set the mask... The reason is that +I think it is better to control the **behavior** rather than the +**capabilities**. + +Options +------- + +A final flag register controls the **behavior** of the CD-ROM +drives, in order to satisfy different users' wishes, hopefully +independently of the ideas of the respective author who happened to +have made the drive's support available to the Linux community. The +current behavior options are:: + + CDO_AUTO_CLOSE /* try to close tray upon device open() */ + CDO_AUTO_EJECT /* try to open tray on last device close() */ + CDO_USE_FFLAGS /* use file_pointer->f_flags to indicate purpose for open() */ + CDO_LOCK /* try to lock door if device is opened */ + CDO_CHECK_TYPE /* ensure disc type is data if opened for data */ + +The initial value of this register is +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`, reflecting my own view on user +interface and software standards. Before you protest, there are two +new *ioctl()'s* implemented in `cdrom.c`, that allow you to control the +behavior by software. These are:: + + CDROM_SET_OPTIONS /* set options specified in (int)arg */ + CDROM_CLEAR_OPTIONS /* clear options specified in (int)arg */ + +One option needs some more explanation: *CDO_USE_FFLAGS*. In the next +newsection we explain what the need for this option is. + +A software package `setcd`, available from the Debian distribution +and `sunsite.unc.edu`, allows user level control of these flags. + + +The need to know the purpose of opening the CD-ROM device +========================================================= + +Traditionally, Unix devices can be used in two different `modes`, +either by reading/writing to the device file, or by issuing +controlling commands to the device, by the device's *ioctl()* +call. The problem with CD-ROM drives, is that they can be used for +two entirely different purposes. One is to mount removable +file systems, CD-ROM's, the other is to play audio CD's. Audio commands +are implemented entirely through *ioctl()\'s*, presumably because the +first implementation (SUN?) has been such. In principle there is +nothing wrong with this, but a good control of the `CD player` demands +that the device can **always** be opened in order to give the +*ioctl* commands, regardless of the state the drive is in. + +On the other hand, when used as a removable-media disc drive (what the +original purpose of CD-ROM s is) we would like to make sure that the +disc drive is ready for operation upon opening the device. In the old +scheme, some CD-ROM drivers don't do any integrity checking, resulting +in a number of i/o errors reported by the VFS to the kernel when an +attempt for mounting a CD-ROM on an empty drive occurs. This is not a +particularly elegant way to find out that there is no CD-ROM inserted; +it more-or-less looks like the old IBM-PC trying to read an empty floppy +drive for a couple of seconds, after which the system complains it +can't read from it. Nowadays we can **sense** the existence of a +removable medium in a drive, and we believe we should exploit that +fact. An integrity check on opening of the device, that verifies the +availability of a CD-ROM and its correct type (data), would be +desirable. + +These two ways of using a CD-ROM drive, principally for data and +secondarily for playing audio discs, have different demands for the +behavior of the *open()* call. Audio use simply wants to open the +device in order to get a file handle which is needed for issuing +*ioctl* commands, while data use wants to open for correct and +reliable data transfer. The only way user programs can indicate what +their *purpose* of opening the device is, is through the *flags* +parameter (see `open(2)`). For CD-ROM devices, these flags aren't +implemented (some drivers implement checking for write-related flags, +but this is not strictly necessary if the device file has correct +permission flags). Most option flags simply don't make sense to +CD-ROM devices: *O_CREAT*, *O_NOCTTY*, *O_TRUNC*, *O_APPEND*, and +*O_SYNC* have no meaning to a CD-ROM. + +We therefore propose to use the flag *O_NONBLOCK* to indicate +that the device is opened just for issuing *ioctl* +commands. Strictly, the meaning of *O_NONBLOCK* is that opening and +subsequent calls to the device don't cause the calling process to +wait. We could interpret this as don't wait until someone has +inserted some valid data-CD-ROM. Thus, our proposal of the +implementation for the *open()* call for CD-ROM s is: + +- If no other flags are set than *O_RDONLY*, the device is opened + for data transfer, and the return value will be 0 only upon successful + initialization of the transfer. The call may even induce some actions + on the CD-ROM, such as closing the tray. +- If the option flag *O_NONBLOCK* is set, opening will always be + successful, unless the whole device doesn't exist. The drive will take + no actions whatsoever. + +And what about standards? +------------------------- + +You might hesitate to accept this proposal as it comes from the +Linux community, and not from some standardizing institute. What +about SUN, SGI, HP and all those other Unix and hardware vendors? +Well, these companies are in the lucky position that they generally +control both the hardware and software of their supported products, +and are large enough to set their own standard. They do not have to +deal with a dozen or more different, competing hardware +configurations\ [#f3]_. + +.. [#f3] + + Incidentally, I think that SUN's approach to mounting CD-ROM s is very + good in origin: under Solaris a volume-daemon automatically mounts a + newly inserted CD-ROM under `/cdrom/**`. + + In my opinion they should have pushed this + further and have **every** CD-ROM on the local area network be + mounted at the similar location, i. e., no matter in which particular + machine you insert a CD-ROM, it will always appear at the same + position in the directory tree, on every system. When I wanted to + implement such a user-program for Linux, I came across the + differences in behavior of the various drivers, and the need for an + *ioctl* informing about media changes. + +We believe that using *O_NONBLOCK* to indicate that a device is being opened +for *ioctl* commands only can be easily introduced in the Linux +community. All the CD-player authors will have to be informed, we can +even send in our own patches to the programs. The use of *O_NONBLOCK* +has most likely no influence on the behavior of the CD-players on +other operating systems than Linux. Finally, a user can always revert +to old behavior by a call to +*ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, CDO_USE_FFLAGS)*. + +The preferred strategy of *open()* +---------------------------------- + +The routines in `cdrom.c` are designed in such a way that run-time +configuration of the behavior of CD-ROM devices (of **any** type) +can be carried out, by the *CDROM_SET/CLEAR_OPTIONS* *ioctls*. Thus, various +modes of operation can be set: + +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK` + This is the default setting. (With *CDO_CHECK_TYPE* it will be better, in + the future.) If the device is not yet opened by any other process, and if + the device is being opened for data (*O_NONBLOCK* is not set) and the + tray is found to be open, an attempt to close the tray is made. Then, + it is verified that a disc is in the drive and, if *CDO_CHECK_TYPE* is + set, that it contains tracks of type `data mode 1`. Only if all tests + are passed is the return value zero. The door is locked to prevent file + system corruption. If the drive is opened for audio (*O_NONBLOCK* is + set), no actions are taken and a value of 0 will be returned. + +`CDO_AUTO_CLOSE | CDO_AUTO_EJECT | CDO_LOCK` + This mimics the behavior of the current sbpcd-driver. The option flags are + ignored, the tray is closed on the first open, if necessary. Similarly, + the tray is opened on the last release, i. e., if a CD-ROM is unmounted, + it is automatically ejected, such that the user can replace it. + +We hope that these option can convince everybody (both driver +maintainers and user program developers) to adopt the new CD-ROM +driver scheme and option flag interpretation. + +Description of routines in `cdrom.c` +==================================== + +Only a few routines in `cdrom.c` are exported to the drivers. In this +new section we will discuss these, as well as the functions that `take +over' the CD-ROM interface to the kernel. The header file belonging +to `cdrom.c` is called `cdrom.h`. Formerly, some of the contents of this +file were placed in the file `ucdrom.h`, but this file has now been +merged back into `cdrom.h`. + +:: + + struct file_operations cdrom_fops + +The contents of this structure were described in cdrom_api_. +A pointer to this structure is assigned to the *fops* field +of the *struct gendisk*. + +:: + + int register_cdrom(struct cdrom_device_info *cdi) + +This function is used in about the same way one registers *cdrom_fops* +with the kernel, the device operations and information structures, +as described in cdrom_api_, should be registered with the +Uniform CD-ROM Driver:: + + register_cdrom(&_info); + + +This function returns zero upon success, and non-zero upon +failure. The structure *_info* should have a pointer to the +driver's *_dops*, as in:: + + struct cdrom_device_info _info = { + _dops; + ... + } + +Note that a driver must have one static structure, *_dops*, while +it may have as many structures *_info* as there are minor devices +active. *Register_cdrom()* builds a linked list from these. + + +:: + + void unregister_cdrom(struct cdrom_device_info *cdi) + +Unregistering device *cdi* with minor number *MINOR(cdi->dev)* removes +the minor device from the list. If it was the last registered minor for +the low-level driver, this disconnects the registered device-operation +routines from the CD-ROM interface. This function returns zero upon +success, and non-zero upon failure. + +:: + + int cdrom_open(struct inode * ip, struct file * fp) + +This function is not called directly by the low-level drivers, it is +listed in the standard *cdrom_fops*. If the VFS opens a file, this +function becomes active. A strategy is implemented in this routine, +taking care of all capabilities and options that are set in the +*cdrom_device_ops* connected to the device. Then, the program flow is +transferred to the device_dependent *open()* call. + +:: + + void cdrom_release(struct inode *ip, struct file *fp) + +This function implements the reverse-logic of *cdrom_open()*, and then +calls the device-dependent *release()* routine. When the use-count has +reached 0, the allocated buffers are flushed by calls to *sync_dev(dev)* +and *invalidate_buffers(dev)*. + + +.. _cdrom_ioctl: + +:: + + int cdrom_ioctl(struct inode *ip, struct file *fp, + unsigned int cmd, unsigned long arg) + +This function handles all the standard *ioctl* requests for CD-ROM +devices in a uniform way. The different calls fall into three +categories: *ioctl()'s* that can be directly implemented by device +operations, ones that are routed through the call *audio_ioctl()*, and +the remaining ones, that are presumable device-dependent. Generally, a +negative return value indicates an error. + +Directly implemented *ioctl()'s* +-------------------------------- + +The following `old` CD-ROM *ioctl()*\ 's are implemented by directly +calling device-operations in *cdrom_device_ops*, if implemented and +not masked: + +`CDROMMULTISESSION` + Requests the last session on a CD-ROM. +`CDROMEJECT` + Open tray. +`CDROMCLOSETRAY` + Close tray. +`CDROMEJECT_SW` + If *arg\not=0*, set behavior to auto-close (close + tray on first open) and auto-eject (eject on last release), otherwise + set behavior to non-moving on *open()* and *release()* calls. +`CDROM_GET_MCN` + Get the Media Catalog Number from a CD. + +*Ioctl*s routed through *audio_ioctl()* +--------------------------------------- + +The following set of *ioctl()'s* are all implemented through a call to +the *cdrom_fops* function *audio_ioctl()*. Memory checks and +allocation are performed in *cdrom_ioctl()*, and also sanitization of +address format (*CDROM_LBA*/*CDROM_MSF*) is done. + +`CDROMSUBCHNL` + Get sub-channel data in argument *arg* of type + `struct cdrom_subchnl *`. +`CDROMREADTOCHDR` + Read Table of Contents header, in *arg* of type + `struct cdrom_tochdr *`. +`CDROMREADTOCENTRY` + Read a Table of Contents entry in *arg* and specified by *arg* + of type `struct cdrom_tocentry *`. +`CDROMPLAYMSF` + Play audio fragment specified in Minute, Second, Frame format, + delimited by *arg* of type `struct cdrom_msf *`. +`CDROMPLAYTRKIND` + Play audio fragment in track-index format delimited by *arg* + of type `struct cdrom_ti *`. +`CDROMVOLCTRL` + Set volume specified by *arg* of type `struct cdrom_volctrl *`. +`CDROMVOLREAD` + Read volume into by *arg* of type `struct cdrom_volctrl *`. +`CDROMSTART` + Spin up disc. +`CDROMSTOP` + Stop playback of audio fragment. +`CDROMPAUSE` + Pause playback of audio fragment. +`CDROMRESUME` + Resume playing. + +New *ioctl()'s* in `cdrom.c` +---------------------------- + +The following *ioctl()'s* have been introduced to allow user programs to +control the behavior of individual CD-ROM devices. New *ioctl* +commands can be identified by the underscores in their names. + +`CDROM_SET_OPTIONS` + Set options specified by *arg*. Returns the option flag register + after modification. Use *arg = \rm0* for reading the current flags. +`CDROM_CLEAR_OPTIONS` + Clear options specified by *arg*. Returns the option flag register + after modification. +`CDROM_SELECT_SPEED` + Select head-rate speed of disc specified as by *arg* in units + of standard cdrom speed (176\,kB/sec raw data or + 150kB/sec file system data). The value 0 means `auto-select`, + i. e., play audio discs at real time and data discs at maximum speed. + The value *arg* is checked against the maximum head rate of the + drive found in the *cdrom_dops*. +`CDROM_SELECT_DISC` + Select disc numbered *arg* from a juke-box. + + First disc is numbered 0. The number *arg* is checked against the + maximum number of discs in the juke-box found in the *cdrom_dops*. +`CDROM_MEDIA_CHANGED` + Returns 1 if a disc has been changed since the last call. + Note that calls to *cdrom_media_changed* by the VFS are treated + by an independent queue, so both mechanisms will detect a + media change once. For juke-boxes, an extra argument *arg* + specifies the slot for which the information is given. The special + value *CDSL_CURRENT* requests that information about the currently + selected slot be returned. +`CDROM_DRIVE_STATUS` + Returns the status of the drive by a call to + *drive_status()*. Return values are defined in cdrom_drive_status_. + Note that this call doesn't return information on the + current playing activity of the drive; this can be polled through + an *ioctl* call to *CDROMSUBCHNL*. For juke-boxes, an extra argument + *arg* specifies the slot for which (possibly limited) information is + given. The special value *CDSL_CURRENT* requests that information + about the currently selected slot be returned. +`CDROM_DISC_STATUS` + Returns the type of the disc currently in the drive. + It should be viewed as a complement to *CDROM_DRIVE_STATUS*. + This *ioctl* can provide *some* information about the current + disc that is inserted in the drive. This functionality used to be + implemented in the low level drivers, but is now carried out + entirely in Uniform CD-ROM Driver. + + The history of development of the CD's use as a carrier medium for + various digital information has lead to many different disc types. + This *ioctl* is useful only in the case that CDs have \emph {only + one} type of data on them. While this is often the case, it is + also very common for CDs to have some tracks with data, and some + tracks with audio. Because this is an existing interface, rather + than fixing this interface by changing the assumptions it was made + under, thereby breaking all user applications that use this + function, the Uniform CD-ROM Driver implements this *ioctl* as + follows: If the CD in question has audio tracks on it, and it has + absolutely no CD-I, XA, or data tracks on it, it will be reported + as *CDS_AUDIO*. If it has both audio and data tracks, it will + return *CDS_MIXED*. If there are no audio tracks on the disc, and + if the CD in question has any CD-I tracks on it, it will be + reported as *CDS_XA_2_2*. Failing that, if the CD in question + has any XA tracks on it, it will be reported as *CDS_XA_2_1*. + Finally, if the CD in question has any data tracks on it, + it will be reported as a data CD (*CDS_DATA_1*). + + This *ioctl* can return:: + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, or tray is opened */ + CDS_AUDIO /* Audio disc (2352 audio bytes/frame) */ + CDS_DATA_1 /* data disc, mode 1 (2048 user bytes/frame) */ + CDS_XA_2_1 /* mixed data (XA), mode 2, form 1 (2048 user bytes) */ + CDS_XA_2_2 /* mixed data (XA), mode 2, form 1 (2324 user bytes) */ + CDS_MIXED /* mixed audio/data disc */ + + For some information concerning frame layout of the various disc + types, see a recent version of `cdrom.h`. + +`CDROM_CHANGER_NSLOTS` + Returns the number of slots in a juke-box. +`CDROMRESET` + Reset the drive. +`CDROM_GET_CAPABILITY` + Returns the *capability* flags for the drive. Refer to section + cdrom_capabilities_ for more information on these flags. +`CDROM_LOCKDOOR` + Locks the door of the drive. `arg == 0` unlocks the door, + any other value locks it. +`CDROM_DEBUG` + Turns on debugging info. Only root is allowed to do this. + Same semantics as CDROM_LOCKDOOR. + + +Device dependent *ioctl()'s* +---------------------------- + +Finally, all other *ioctl()'s* are passed to the function *dev_ioctl()*, +if implemented. No memory allocation or verification is carried out. + +How to update your driver +========================= + +- Make a backup of your current driver. +- Get hold of the files `cdrom.c` and `cdrom.h`, they should be in + the directory tree that came with this documentation. +- Make sure you include `cdrom.h`. +- Change the 3rd argument of *register_blkdev* from `&_fops` + to `&cdrom_fops`. +- Just after that line, add the following to register with the Uniform + CD-ROM Driver:: + + register_cdrom(&_info);* + + Similarly, add a call to *unregister_cdrom()* at the appropriate place. +- Copy an example of the device-operations *struct* to your + source, e. g., from `cm206.c` *cm206_dops*, and change all + entries to names corresponding to your driver, or names you just + happen to like. If your driver doesn't support a certain function, + make the entry *NULL*. At the entry *capability* you should list all + capabilities your driver currently supports. If your driver + has a capability that is not listed, please send me a message. +- Copy the *cdrom_device_info* declaration from the same example + driver, and modify the entries according to your needs. If your + driver dynamically determines the capabilities of the hardware, this + structure should also be declared dynamically. +- Implement all functions in your `_dops` structure, + according to prototypes listed in `cdrom.h`, and specifications given + in cdrom_api_. Most likely you have already implemented + the code in a large part, and you will almost certainly need to adapt the + prototype and return values. +- Rename your `_ioctl()` function to *audio_ioctl* and + change the prototype a little. Remove entries listed in the first + part in cdrom_ioctl_, if your code was OK, these are + just calls to the routines you adapted in the previous step. +- You may remove all remaining memory checking code in the + *audio_ioctl()* function that deals with audio commands (these are + listed in the second part of cdrom_ioctl_. There is no + need for memory allocation either, so most *case*s in the *switch* + statement look similar to:: + + case CDROMREADTOCENTRY: + get_toc_entry\bigl((struct cdrom_tocentry *) arg); + +- All remaining *ioctl* cases must be moved to a separate + function, *_ioctl*, the device-dependent *ioctl()'s*. Note that + memory checking and allocation must be kept in this code! +- Change the prototypes of *_open()* and + *_release()*, and remove any strategic code (i. e., tray + movement, door locking, etc.). +- Try to recompile the drivers. We advise you to use modules, both + for `cdrom.o` and your driver, as debugging is much easier this + way. + +Thanks +====== + +Thanks to all the people involved. First, Erik Andersen, who has +taken over the torch in maintaining `cdrom.c` and integrating much +CD-ROM-related code in the 2.1-kernel. Thanks to Scott Snyder and +Gerd Knorr, who were the first to implement this interface for SCSI +and IDE-CD drivers and added many ideas for extension of the data +structures relative to kernel~2.0. Further thanks to Heiko Eißfeldt, +Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard Mönkeberg and Andrew Kroll, +the Linux CD-ROM device driver developers who were kind +enough to give suggestions and criticisms during the writing. Finally +of course, I want to thank Linus Torvalds for making this possible in +the first place. -- cgit From 8ea618899b6b4fbe97c8462e7d769867307de011 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:40 -0300 Subject: docs: cdrom: convert docs to ReST and rename to *.rst The stuff there is almost already at ReST format. A conversion for them is trivial: just add a missing titles and fix some scape codes for them to match ReST syntax. While here, rename the cdrom-standard.txt, with was converted from LaTeX to ReST on the previous patch, and add it to the index file. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/cdrom/cdrom-standard.rst | 1063 ++++++++++++++++++++++++++++++++ Documentation/cdrom/cdrom-standard.txt | 1063 -------------------------------- Documentation/cdrom/ide-cd | 534 ---------------- Documentation/cdrom/ide-cd.rst | 538 ++++++++++++++++ Documentation/cdrom/index.rst | 19 + Documentation/cdrom/packet-writing.rst | 139 +++++ Documentation/cdrom/packet-writing.txt | 132 ---- 7 files changed, 1759 insertions(+), 1729 deletions(-) create mode 100644 Documentation/cdrom/cdrom-standard.rst delete mode 100644 Documentation/cdrom/cdrom-standard.txt delete mode 100644 Documentation/cdrom/ide-cd create mode 100644 Documentation/cdrom/ide-cd.rst create mode 100644 Documentation/cdrom/index.rst create mode 100644 Documentation/cdrom/packet-writing.rst delete mode 100644 Documentation/cdrom/packet-writing.txt (limited to 'Documentation') diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst new file mode 100644 index 000000000000..dde4f7f7fdbf --- /dev/null +++ b/Documentation/cdrom/cdrom-standard.rst @@ -0,0 +1,1063 @@ +======================= +A Linux CD-ROM standard +======================= + +:Author: David van Leeuwen +:Date: 12 March 1999 +:Updated by: Erik Andersen (andersee@debian.org) +:Updated by: Jens Axboe (axboe@image.dk) + + +Introduction +============ + +Linux is probably the Unix-like operating system that supports +the widest variety of hardware devices. The reasons for this are +presumably + +- The large list of hardware devices available for the many platforms + that Linux now supports (i.e., i386-PCs, Sparc Suns, etc.) +- The open design of the operating system, such that anybody can write a + driver for Linux. +- There is plenty of source code around as examples of how to write a driver. + +The openness of Linux, and the many different types of available +hardware has allowed Linux to support many different hardware devices. +Unfortunately, the very openness that has allowed Linux to support +all these different devices has also allowed the behavior of each +device driver to differ significantly from one device to another. +This divergence of behavior has been very significant for CD-ROM +devices; the way a particular drive reacts to a `standard` *ioctl()* +call varies greatly from one device driver to another. To avoid making +their drivers totally inconsistent, the writers of Linux CD-ROM +drivers generally created new device drivers by understanding, copying, +and then changing an existing one. Unfortunately, this practice did not +maintain uniform behavior across all the Linux CD-ROM drivers. + +This document describes an effort to establish Uniform behavior across +all the different CD-ROM device drivers for Linux. This document also +defines the various *ioctl()'s*, and how the low-level CD-ROM device +drivers should implement them. Currently (as of the Linux 2.1.\ *x* +development kernels) several low-level CD-ROM device drivers, including +both IDE/ATAPI and SCSI, now use this Uniform interface. + +When the CD-ROM was developed, the interface between the CD-ROM drive +and the computer was not specified in the standards. As a result, many +different CD-ROM interfaces were developed. Some of them had their +own proprietary design (Sony, Mitsumi, Panasonic, Philips), other +manufacturers adopted an existing electrical interface and changed +the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply +adapted their drives to one or more of the already existing electrical +interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and +most of the `NoName` manufacturers). In cases where a new drive really +brought its own interface or used its own command set and flow control +scheme, either a separate driver had to be written, or an existing +driver had to be enhanced. History has delivered us CD-ROM support for +many of these different interfaces. Nowadays, almost all new CD-ROM +drives are either IDE/ATAPI or SCSI, and it is very unlikely that any +manufacturer will create a new interface. Even finding drives for the +old proprietary interfaces is getting difficult. + +When (in the 1.3.70's) I looked at the existing software interface, +which was expressed through `cdrom.h`, it appeared to be a rather wild +set of commands and data formats [#f1]_. It seemed that many +features of the software interface had been added to accommodate the +capabilities of a particular drive, in an *ad hoc* manner. More +importantly, it appeared that the behavior of the `standard` commands +was different for most of the different drivers: e. g., some drivers +close the tray if an *open()* call occurs when the tray is open, while +others do not. Some drivers lock the door upon opening the device, to +prevent an incoherent file system, but others don't, to allow software +ejection. Undoubtedly, the capabilities of the different drives vary, +but even when two drives have the same capability their drivers' +behavior was usually different. + +.. [#f1] + I cannot recollect what kernel version I looked at, then, + presumably 1.2.13 and 1.3.34 --- the latest kernel that I was + indirectly involved in. + +I decided to start a discussion on how to make all the Linux CD-ROM +drivers behave more uniformly. I began by contacting the developers of +the many CD-ROM drivers found in the Linux kernel. Their reactions +encouraged me to write the Uniform CD-ROM Driver which this document is +intended to describe. The implementation of the Uniform CD-ROM Driver is +in the file `cdrom.c`. This driver is intended to be an additional software +layer that sits on top of the low-level device drivers for each CD-ROM drive. +By adding this additional layer, it is possible to have all the different +CD-ROM devices behave **exactly** the same (insofar as the underlying +hardware will allow). + +The goal of the Uniform CD-ROM Driver is **not** to alienate driver developers +whohave not yet taken steps to support this effort. The goal of Uniform CD-ROM +Driver is simply to give people writing application programs for CD-ROM drives +**one** Linux CD-ROM interface with consistent behavior for all +CD-ROM devices. In addition, this also provides a consistent interface +between the low-level device driver code and the Linux kernel. Care +is taken that 100% compatibility exists with the data structures and +programmer's interface defined in `cdrom.h`. This guide was written to +help CD-ROM driver developers adapt their code to use the Uniform CD-ROM +Driver code defined in `cdrom.c`. + +Personally, I think that the most important hardware interfaces are +the IDE/ATAPI drives and, of course, the SCSI drives, but as prices +of hardware drop continuously, it is also likely that people may have +more than one CD-ROM drive, possibly of mixed types. It is important +that these drives behave in the same way. In December 1994, one of the +cheapest CD-ROM drives was a Philips cm206, a double-speed proprietary +drive. In the months that I was busy writing a Linux driver for it, +proprietary drives became obsolete and IDE/ATAPI drives became the +standard. At the time of the last update to this document (November +1997) it is becoming difficult to even **find** anything less than a +16 speed CD-ROM drive, and 24 speed drives are common. + +.. _cdrom_api: + +Standardizing through another software level +============================================ + +At the time this document was conceived, all drivers directly +implemented the CD-ROM *ioctl()* calls through their own routines. This +led to the danger of different drivers forgetting to do important things +like checking that the user was giving the driver valid data. More +importantly, this led to the divergence of behavior, which has already +been discussed. + +For this reason, the Uniform CD-ROM Driver was created to enforce consistent +CD-ROM drive behavior, and to provide a common set of services to the various +low-level CD-ROM device drivers. The Uniform CD-ROM Driver now provides another +software-level, that separates the *ioctl()* and *open()* implementation +from the actual hardware implementation. Note that this effort has +made few changes which will affect a user's application programs. The +greatest change involved moving the contents of the various low-level +CD-ROM drivers\' header files to the kernel's cdrom directory. This was +done to help ensure that the user is only presented with only one cdrom +interface, the interface defined in `cdrom.h`. + +CD-ROM drives are specific enough (i. e., different from other +block-devices such as floppy or hard disc drives), to define a set +of common **CD-ROM device operations**, *_dops*. +These operations are different from the classical block-device file +operations, *_fops*. + +The routines for the Uniform CD-ROM Driver interface level are implemented +in the file `cdrom.c`. In this file, the Uniform CD-ROM Driver interfaces +with the kernel as a block device by registering the following general +*struct file_operations*:: + + struct file_operations cdrom_fops = { + NULL, /∗ lseek ∗/ + block _read , /∗ read—general block-dev read ∗/ + block _write, /∗ write—general block-dev write ∗/ + NULL, /∗ readdir ∗/ + NULL, /∗ select ∗/ + cdrom_ioctl, /∗ ioctl ∗/ + NULL, /∗ mmap ∗/ + cdrom_open, /∗ open ∗/ + cdrom_release, /∗ release ∗/ + NULL, /∗ fsync ∗/ + NULL, /∗ fasync ∗/ + cdrom_media_changed, /∗ media change ∗/ + NULL /∗ revalidate ∗/ + }; + +Every active CD-ROM device shares this *struct*. The routines +declared above are all implemented in `cdrom.c`, since this file is the +place where the behavior of all CD-ROM-devices is defined and +standardized. The actual interface to the various types of CD-ROM +hardware is still performed by various low-level CD-ROM-device +drivers. These routines simply implement certain **capabilities** +that are common to all CD-ROM (and really, all removable-media +devices). + +Registration of a low-level CD-ROM device driver is now done through +the general routines in `cdrom.c`, not through the Virtual File System +(VFS) any more. The interface implemented in `cdrom.c` is carried out +through two general structures that contain information about the +capabilities of the driver, and the specific drives on which the +driver operates. The structures are: + +cdrom_device_ops + This structure contains information about the low-level driver for a + CD-ROM device. This structure is conceptually connected to the major + number of the device (although some drivers may have different + major numbers, as is the case for the IDE driver). + +cdrom_device_info + This structure contains information about a particular CD-ROM drive, + such as its device name, speed, etc. This structure is conceptually + connected to the minor number of the device. + +Registering a particular CD-ROM drive with the Uniform CD-ROM Driver +is done by the low-level device driver though a call to:: + + register_cdrom(struct cdrom_device_info * _info) + +The device information structure, *_info*, contains all the +information needed for the kernel to interface with the low-level +CD-ROM device driver. One of the most important entries in this +structure is a pointer to the *cdrom_device_ops* structure of the +low-level driver. + +The device operations structure, *cdrom_device_ops*, contains a list +of pointers to the functions which are implemented in the low-level +device driver. When `cdrom.c` accesses a CD-ROM device, it does it +through the functions in this structure. It is impossible to know all +the capabilities of future CD-ROM drives, so it is expected that this +list may need to be expanded from time to time as new technologies are +developed. For example, CD-R and CD-R/W drives are beginning to become +popular, and support will soon need to be added for them. For now, the +current *struct* is:: + + struct cdrom_device_ops { + int (*open)(struct cdrom_device_info *, int) + void (*release)(struct cdrom_device_info *); + int (*drive_status)(struct cdrom_device_info *, int); + unsigned int (*check_events)(struct cdrom_device_info *, + unsigned int, int); + int (*media_changed)(struct cdrom_device_info *, int); + int (*tray_move)(struct cdrom_device_info *, int); + int (*lock_door)(struct cdrom_device_info *, int); + int (*select_speed)(struct cdrom_device_info *, int); + int (*select_disc)(struct cdrom_device_info *, int); + int (*get_last_session) (struct cdrom_device_info *, + struct cdrom_multisession *); + int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); + int (*reset)(struct cdrom_device_info *); + int (*audio_ioctl)(struct cdrom_device_info *, + unsigned int, void *); + const int capability; /* capability flags */ + int (*generic_packet)(struct cdrom_device_info *, + struct packet_command *); + }; + +When a low-level device driver implements one of these capabilities, +it should add a function pointer to this *struct*. When a particular +function is not implemented, however, this *struct* should contain a +NULL instead. The *capability* flags specify the capabilities of the +CD-ROM hardware and/or low-level CD-ROM driver when a CD-ROM drive +is registered with the Uniform CD-ROM Driver. + +Note that most functions have fewer parameters than their +*blkdev_fops* counterparts. This is because very little of the +information in the structures *inode* and *file* is used. For most +drivers, the main parameter is the *struct* *cdrom_device_info*, from +which the major and minor number can be extracted. (Most low-level +CD-ROM drivers don't even look at the major and minor number though, +since many of them only support one device.) This will be available +through *dev* in *cdrom_device_info* described below. + +The drive-specific, minor-like information that is registered with +`cdrom.c`, currently contains the following fields:: + + struct cdrom_device_info { + const struct cdrom_device_ops * ops; /* device operations for this major */ + struct list_head list; /* linked list of all device_info */ + struct gendisk * disk; /* matching block layer disk */ + void * handle; /* driver-dependent data */ + + int mask; /* mask of capability: disables them */ + int speed; /* maximum speed for reading data */ + int capacity; /* number of discs in a jukebox */ + + unsigned int options:30; /* options flags */ + unsigned mc_flags:2; /* media-change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ + int use_count; /* number of times device is opened */ + char name[20]; /* name of the device type */ + + __u8 sanyo_slot : 2; /* Sanyo 3-CD changer support */ + __u8 keeplocked : 1; /* CDROM_LOCKDOOR status */ + __u8 reserved : 5; /* not used yet */ + int cdda_method; /* see CDDA_* flags */ + __u8 last_sense; /* saves last sense key */ + __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ + unsigned short mmc3_profile; /* current MMC3 profile */ + int for_data; /* unknown:TBD */ + int (*exit)(struct cdrom_device_info *);/* unknown:TBD */ + int mrw_mode_page; /* which MRW mode page is in use */ + }; + +Using this *struct*, a linked list of the registered minor devices is +built, using the *next* field. The device number, the device operations +struct and specifications of properties of the drive are stored in this +structure. + +The *mask* flags can be used to mask out some of the capabilities listed +in *ops->capability*, if a specific drive doesn't support a feature +of the driver. The value *speed* specifies the maximum head-rate of the +drive, measured in units of normal audio speed (176kB/sec raw data or +150kB/sec file system data). The parameters are declared *const* +because they describe properties of the drive, which don't change after +registration. + +A few registers contain variables local to the CD-ROM drive. The +flags *options* are used to specify how the general CD-ROM routines +should behave. These various flags registers should provide enough +flexibility to adapt to the different users' wishes (and **not** the +`arbitrary` wishes of the author of the low-level device driver, as is +the case in the old scheme). The register *mc_flags* is used to buffer +the information from *media_changed()* to two separate queues. Other +data that is specific to a minor drive, can be accessed through *handle*, +which can point to a data structure specific to the low-level driver. +The fields *use_count*, *next*, *options* and *mc_flags* need not be +initialized. + +The intermediate software layer that `cdrom.c` forms will perform some +additional bookkeeping. The use count of the device (the number of +processes that have the device opened) is registered in *use_count*. The +function *cdrom_ioctl()* will verify the appropriate user-memory regions +for read and write, and in case a location on the CD is transferred, +it will `sanitize` the format by making requests to the low-level +drivers in a standard format, and translating all formats between the +user-software and low level drivers. This relieves much of the drivers' +memory checking and format checking and translation. Also, the necessary +structures will be declared on the program stack. + +The implementation of the functions should be as defined in the +following sections. Two functions **must** be implemented, namely +*open()* and *release()*. Other functions may be omitted, their +corresponding capability flags will be cleared upon registration. +Generally, a function returns zero on success and negative on error. A +function call should return only after the command has completed, but of +course waiting for the device should not use processor time. + +:: + + int open(struct cdrom_device_info *cdi, int purpose) + +*Open()* should try to open the device for a specific *purpose*, which +can be either: + +- Open for reading data, as done by `mount()` (2), or the + user commands `dd` or `cat`. +- Open for *ioctl* commands, as done by audio-CD playing programs. + +Notice that any strategic code (closing tray upon *open()*, etc.) is +done by the calling routine in `cdrom.c`, so the low-level routine +should only be concerned with proper initialization, such as spinning +up the disc, etc. + +:: + + void release(struct cdrom_device_info *cdi) + +Device-specific actions should be taken such as spinning down the device. +However, strategic actions such as ejection of the tray, or unlocking +the door, should be left over to the general routine *cdrom_release()*. +This is the only function returning type *void*. + +.. _cdrom_drive_status: + +:: + + int drive_status(struct cdrom_device_info *cdi, int slot_nr) + +The function *drive_status*, if implemented, should provide +information on the status of the drive (not the status of the disc, +which may or may not be in the drive). If the drive is not a changer, +*slot_nr* should be ignored. In `cdrom.h` the possibilities are listed:: + + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, tray is closed */ + CDS_TRAY_OPEN /* tray is opened */ + CDS_DRIVE_NOT_READY /* something is wrong, tray is moving? */ + CDS_DISC_OK /* a disc is loaded and everything is fine */ + +:: + + int media_changed(struct cdrom_device_info *cdi, int disc_nr) + +This function is very similar to the original function in $struct +file_operations*. It returns 1 if the medium of the device *cdi->dev* +has changed since the last call, and 0 otherwise. The parameter +*disc_nr* identifies a specific slot in a juke-box, it should be +ignored for single-disc drives. Note that by `re-routing` this +function through *cdrom_media_changed()*, we can implement separate +queues for the VFS and a new *ioctl()* function that can report device +changes to software (e. g., an auto-mounting daemon). + +:: + + int tray_move(struct cdrom_device_info *cdi, int position) + +This function, if implemented, should control the tray movement. (No +other function should control this.) The parameter *position* controls +the desired direction of movement: + +- 0 Close tray +- 1 Open tray + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the tray is already in the desired position, no +action need be taken, and the return value should be 0. + +:: + + int lock_door(struct cdrom_device_info *cdi, int lock) + +This function (and no other code) controls locking of the door, if the +drive allows this. The value of *lock* controls the desired locking +state: + +- 0 Unlock door, manual opening is allowed +- 1 Lock door, tray cannot be ejected manually + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the door is already in the requested state, no +action need be taken, and the return value should be 0. + +:: + + int select_speed(struct cdrom_device_info *cdi, int speed) + +Some CD-ROM drives are capable of changing their head-speed. There +are several reasons for changing the speed of a CD-ROM drive. Badly +pressed CD-ROM s may benefit from less-than-maximum head rate. Modern +CD-ROM drives can obtain very high head rates (up to *24x* is +common). It has been reported that these drives can make reading +errors at these high speeds, reducing the speed can prevent data loss +in these circumstances. Finally, some of these drives can +make an annoyingly loud noise, which a lower speed may reduce. + +This function specifies the speed at which data is read or audio is +played back. The value of *speed* specifies the head-speed of the +drive, measured in units of standard cdrom speed (176kB/sec raw data +or 150kB/sec file system data). So to request that a CD-ROM drive +operate at 300kB/sec you would call the CDROM_SELECT_SPEED *ioctl* +with *speed=2*. The special value `0` means `auto-selection`, i. e., +maximum data-rate or real-time audio rate. If the drive doesn't have +this `auto-selection` capability, the decision should be made on the +current disc loaded and the return value should be positive. A negative +return value indicates an error. + +:: + + int select_disc(struct cdrom_device_info *cdi, int number) + +If the drive can store multiple discs (a juke-box) this function +will perform disc selection. It should return the number of the +selected disc on success, a negative value on error. Currently, only +the ide-cd driver supports this functionality. + +:: + + int get_last_session(struct cdrom_device_info *cdi, + struct cdrom_multisession *ms_info) + +This function should implement the old corresponding *ioctl()*. For +device *cdi->dev*, the start of the last session of the current disc +should be returned in the pointer argument *ms_info*. Note that +routines in `cdrom.c` have sanitized this argument: its requested +format will **always** be of the type *CDROM_LBA* (linear block +addressing mode), whatever the calling software requested. But +sanitization goes even further: the low-level implementation may +return the requested information in *CDROM_MSF* format if it wishes so +(setting the *ms_info->addr_format* field appropriately, of +course) and the routines in `cdrom.c` will make the transformation if +necessary. The return value is 0 upon success. + +:: + + int get_mcn(struct cdrom_device_info *cdi, + struct cdrom_mcn *mcn) + +Some discs carry a `Media Catalog Number` (MCN), also called +`Universal Product Code` (UPC). This number should reflect the number +that is generally found in the bar-code on the product. Unfortunately, +the few discs that carry such a number on the disc don't even use the +same format. The return argument to this function is a pointer to a +pre-declared memory region of type *struct cdrom_mcn*. The MCN is +expected as a 13-character string, terminated by a null-character. + +:: + + int reset(struct cdrom_device_info *cdi) + +This call should perform a hard-reset on the drive (although in +circumstances that a hard-reset is necessary, a drive may very well not +listen to commands anymore). Preferably, control is returned to the +caller only after the drive has finished resetting. If the drive is no +longer listening, it may be wise for the underlying low-level cdrom +driver to time out. + +:: + + int audio_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, void *arg) + +Some of the CD-ROM-\ *ioctl()*\ 's defined in `cdrom.h` can be +implemented by the routines described above, and hence the function +*cdrom_ioctl* will use those. However, most *ioctl()*\ 's deal with +audio-control. We have decided to leave these to be accessed through a +single function, repeating the arguments *cmd* and *arg*. Note that +the latter is of type *void*, rather than *unsigned long int*. +The routine *cdrom_ioctl()* does do some useful things, +though. It sanitizes the address format type to *CDROM_MSF* (Minutes, +Seconds, Frames) for all audio calls. It also verifies the memory +location of *arg*, and reserves stack-memory for the argument. This +makes implementation of the *audio_ioctl()* much simpler than in the +old driver scheme. For example, you may look up the function +*cm206_audio_ioctl()* `cm206.c` that should be updated with +this documentation. + +An unimplemented ioctl should return *-ENOSYS*, but a harmless request +(e. g., *CDROMSTART*) may be ignored by returning 0 (success). Other +errors should be according to the standards, whatever they are. When +an error is returned by the low-level driver, the Uniform CD-ROM Driver +tries whenever possible to return the error code to the calling program. +(We may decide to sanitize the return value in *cdrom_ioctl()* though, in +order to guarantee a uniform interface to the audio-player software.) + +:: + + int dev_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, unsigned long arg) + +Some *ioctl()'s* seem to be specific to certain CD-ROM drives. That is, +they are introduced to service some capabilities of certain drives. In +fact, there are 6 different *ioctl()'s* for reading data, either in some +particular kind of format, or audio data. Not many drives support +reading audio tracks as data, I believe this is because of protection +of copyrights of artists. Moreover, I think that if audio-tracks are +supported, it should be done through the VFS and not via *ioctl()'s*. A +problem here could be the fact that audio-frames are 2352 bytes long, +so either the audio-file-system should ask for 75264 bytes at once +(the least common multiple of 512 and 2352), or the drivers should +bend their backs to cope with this incoherence (to which I would be +opposed). Furthermore, it is very difficult for the hardware to find +the exact frame boundaries, since there are no synchronization headers +in audio frames. Once these issues are resolved, this code should be +standardized in `cdrom.c`. + +Because there are so many *ioctl()'s* that seem to be introduced to +satisfy certain drivers [#f2]_, any non-standard *ioctl()*\ s +are routed through the call *dev_ioctl()*. In principle, `private` +*ioctl()*\ 's should be numbered after the device's major number, and not +the general CD-ROM *ioctl* number, `0x53`. Currently the +non-supported *ioctl()'s* are: + + CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW, + CDROMREADCOOKED, CDROMSEEK, CDROMPLAY-BLK and CDROM-READALL + +.. [#f2] + + Is there software around that actually uses these? I'd be interested! + +.. _cdrom_capabilities: + +CD-ROM capabilities +------------------- + +Instead of just implementing some *ioctl* calls, the interface in +`cdrom.c` supplies the possibility to indicate the **capabilities** +of a CD-ROM drive. This can be done by ORing any number of +capability-constants that are defined in `cdrom.h` at the registration +phase. Currently, the capabilities are any of:: + + CDC_CLOSE_TRAY /* can close tray by software control */ + CDC_OPEN_TRAY /* can open tray */ + CDC_LOCK /* can lock and unlock the door */ + CDC_SELECT_SPEED /* can select speed, in units of * sim*150 ,kB/s */ + CDC_SELECT_DISC /* drive is juke-box */ + CDC_MULTI_SESSION /* can read sessions *> rm1* */ + CDC_MCN /* can read Media Catalog Number */ + CDC_MEDIA_CHANGED /* can report if disc has changed */ + CDC_PLAY_AUDIO /* can perform audio-functions (play, pause, etc) */ + CDC_RESET /* hard reset device */ + CDC_IOCTLS /* driver has non-standard ioctls */ + CDC_DRIVE_STATUS /* driver implements drive status */ + +The capability flag is declared *const*, to prevent drivers from +accidentally tampering with the contents. The capability fags actually +inform `cdrom.c` of what the driver can do. If the drive found +by the driver does not have the capability, is can be masked out by +the *cdrom_device_info* variable *mask*. For instance, the SCSI CD-ROM +driver has implemented the code for loading and ejecting CD-ROM's, and +hence its corresponding flags in *capability* will be set. But a SCSI +CD-ROM drive might be a caddy system, which can't load the tray, and +hence for this drive the *cdrom_device_info* struct will have set +the *CDC_CLOSE_TRAY* bit in *mask*. + +In the file `cdrom.c` you will encounter many constructions of the type:: + + if (cdo->capability & ∼cdi->mask & CDC _⟨capability⟩) ... + +There is no *ioctl* to set the mask... The reason is that +I think it is better to control the **behavior** rather than the +**capabilities**. + +Options +------- + +A final flag register controls the **behavior** of the CD-ROM +drives, in order to satisfy different users' wishes, hopefully +independently of the ideas of the respective author who happened to +have made the drive's support available to the Linux community. The +current behavior options are:: + + CDO_AUTO_CLOSE /* try to close tray upon device open() */ + CDO_AUTO_EJECT /* try to open tray on last device close() */ + CDO_USE_FFLAGS /* use file_pointer->f_flags to indicate purpose for open() */ + CDO_LOCK /* try to lock door if device is opened */ + CDO_CHECK_TYPE /* ensure disc type is data if opened for data */ + +The initial value of this register is +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`, reflecting my own view on user +interface and software standards. Before you protest, there are two +new *ioctl()'s* implemented in `cdrom.c`, that allow you to control the +behavior by software. These are:: + + CDROM_SET_OPTIONS /* set options specified in (int)arg */ + CDROM_CLEAR_OPTIONS /* clear options specified in (int)arg */ + +One option needs some more explanation: *CDO_USE_FFLAGS*. In the next +newsection we explain what the need for this option is. + +A software package `setcd`, available from the Debian distribution +and `sunsite.unc.edu`, allows user level control of these flags. + + +The need to know the purpose of opening the CD-ROM device +========================================================= + +Traditionally, Unix devices can be used in two different `modes`, +either by reading/writing to the device file, or by issuing +controlling commands to the device, by the device's *ioctl()* +call. The problem with CD-ROM drives, is that they can be used for +two entirely different purposes. One is to mount removable +file systems, CD-ROM's, the other is to play audio CD's. Audio commands +are implemented entirely through *ioctl()\'s*, presumably because the +first implementation (SUN?) has been such. In principle there is +nothing wrong with this, but a good control of the `CD player` demands +that the device can **always** be opened in order to give the +*ioctl* commands, regardless of the state the drive is in. + +On the other hand, when used as a removable-media disc drive (what the +original purpose of CD-ROM s is) we would like to make sure that the +disc drive is ready for operation upon opening the device. In the old +scheme, some CD-ROM drivers don't do any integrity checking, resulting +in a number of i/o errors reported by the VFS to the kernel when an +attempt for mounting a CD-ROM on an empty drive occurs. This is not a +particularly elegant way to find out that there is no CD-ROM inserted; +it more-or-less looks like the old IBM-PC trying to read an empty floppy +drive for a couple of seconds, after which the system complains it +can't read from it. Nowadays we can **sense** the existence of a +removable medium in a drive, and we believe we should exploit that +fact. An integrity check on opening of the device, that verifies the +availability of a CD-ROM and its correct type (data), would be +desirable. + +These two ways of using a CD-ROM drive, principally for data and +secondarily for playing audio discs, have different demands for the +behavior of the *open()* call. Audio use simply wants to open the +device in order to get a file handle which is needed for issuing +*ioctl* commands, while data use wants to open for correct and +reliable data transfer. The only way user programs can indicate what +their *purpose* of opening the device is, is through the *flags* +parameter (see `open(2)`). For CD-ROM devices, these flags aren't +implemented (some drivers implement checking for write-related flags, +but this is not strictly necessary if the device file has correct +permission flags). Most option flags simply don't make sense to +CD-ROM devices: *O_CREAT*, *O_NOCTTY*, *O_TRUNC*, *O_APPEND*, and +*O_SYNC* have no meaning to a CD-ROM. + +We therefore propose to use the flag *O_NONBLOCK* to indicate +that the device is opened just for issuing *ioctl* +commands. Strictly, the meaning of *O_NONBLOCK* is that opening and +subsequent calls to the device don't cause the calling process to +wait. We could interpret this as don't wait until someone has +inserted some valid data-CD-ROM. Thus, our proposal of the +implementation for the *open()* call for CD-ROM s is: + +- If no other flags are set than *O_RDONLY*, the device is opened + for data transfer, and the return value will be 0 only upon successful + initialization of the transfer. The call may even induce some actions + on the CD-ROM, such as closing the tray. +- If the option flag *O_NONBLOCK* is set, opening will always be + successful, unless the whole device doesn't exist. The drive will take + no actions whatsoever. + +And what about standards? +------------------------- + +You might hesitate to accept this proposal as it comes from the +Linux community, and not from some standardizing institute. What +about SUN, SGI, HP and all those other Unix and hardware vendors? +Well, these companies are in the lucky position that they generally +control both the hardware and software of their supported products, +and are large enough to set their own standard. They do not have to +deal with a dozen or more different, competing hardware +configurations\ [#f3]_. + +.. [#f3] + + Incidentally, I think that SUN's approach to mounting CD-ROM s is very + good in origin: under Solaris a volume-daemon automatically mounts a + newly inserted CD-ROM under `/cdrom/**`. + + In my opinion they should have pushed this + further and have **every** CD-ROM on the local area network be + mounted at the similar location, i. e., no matter in which particular + machine you insert a CD-ROM, it will always appear at the same + position in the directory tree, on every system. When I wanted to + implement such a user-program for Linux, I came across the + differences in behavior of the various drivers, and the need for an + *ioctl* informing about media changes. + +We believe that using *O_NONBLOCK* to indicate that a device is being opened +for *ioctl* commands only can be easily introduced in the Linux +community. All the CD-player authors will have to be informed, we can +even send in our own patches to the programs. The use of *O_NONBLOCK* +has most likely no influence on the behavior of the CD-players on +other operating systems than Linux. Finally, a user can always revert +to old behavior by a call to +*ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, CDO_USE_FFLAGS)*. + +The preferred strategy of *open()* +---------------------------------- + +The routines in `cdrom.c` are designed in such a way that run-time +configuration of the behavior of CD-ROM devices (of **any** type) +can be carried out, by the *CDROM_SET/CLEAR_OPTIONS* *ioctls*. Thus, various +modes of operation can be set: + +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK` + This is the default setting. (With *CDO_CHECK_TYPE* it will be better, in + the future.) If the device is not yet opened by any other process, and if + the device is being opened for data (*O_NONBLOCK* is not set) and the + tray is found to be open, an attempt to close the tray is made. Then, + it is verified that a disc is in the drive and, if *CDO_CHECK_TYPE* is + set, that it contains tracks of type `data mode 1`. Only if all tests + are passed is the return value zero. The door is locked to prevent file + system corruption. If the drive is opened for audio (*O_NONBLOCK* is + set), no actions are taken and a value of 0 will be returned. + +`CDO_AUTO_CLOSE | CDO_AUTO_EJECT | CDO_LOCK` + This mimics the behavior of the current sbpcd-driver. The option flags are + ignored, the tray is closed on the first open, if necessary. Similarly, + the tray is opened on the last release, i. e., if a CD-ROM is unmounted, + it is automatically ejected, such that the user can replace it. + +We hope that these option can convince everybody (both driver +maintainers and user program developers) to adopt the new CD-ROM +driver scheme and option flag interpretation. + +Description of routines in `cdrom.c` +==================================== + +Only a few routines in `cdrom.c` are exported to the drivers. In this +new section we will discuss these, as well as the functions that `take +over' the CD-ROM interface to the kernel. The header file belonging +to `cdrom.c` is called `cdrom.h`. Formerly, some of the contents of this +file were placed in the file `ucdrom.h`, but this file has now been +merged back into `cdrom.h`. + +:: + + struct file_operations cdrom_fops + +The contents of this structure were described in cdrom_api_. +A pointer to this structure is assigned to the *fops* field +of the *struct gendisk*. + +:: + + int register_cdrom(struct cdrom_device_info *cdi) + +This function is used in about the same way one registers *cdrom_fops* +with the kernel, the device operations and information structures, +as described in cdrom_api_, should be registered with the +Uniform CD-ROM Driver:: + + register_cdrom(&_info); + + +This function returns zero upon success, and non-zero upon +failure. The structure *_info* should have a pointer to the +driver's *_dops*, as in:: + + struct cdrom_device_info _info = { + _dops; + ... + } + +Note that a driver must have one static structure, *_dops*, while +it may have as many structures *_info* as there are minor devices +active. *Register_cdrom()* builds a linked list from these. + + +:: + + void unregister_cdrom(struct cdrom_device_info *cdi) + +Unregistering device *cdi* with minor number *MINOR(cdi->dev)* removes +the minor device from the list. If it was the last registered minor for +the low-level driver, this disconnects the registered device-operation +routines from the CD-ROM interface. This function returns zero upon +success, and non-zero upon failure. + +:: + + int cdrom_open(struct inode * ip, struct file * fp) + +This function is not called directly by the low-level drivers, it is +listed in the standard *cdrom_fops*. If the VFS opens a file, this +function becomes active. A strategy is implemented in this routine, +taking care of all capabilities and options that are set in the +*cdrom_device_ops* connected to the device. Then, the program flow is +transferred to the device_dependent *open()* call. + +:: + + void cdrom_release(struct inode *ip, struct file *fp) + +This function implements the reverse-logic of *cdrom_open()*, and then +calls the device-dependent *release()* routine. When the use-count has +reached 0, the allocated buffers are flushed by calls to *sync_dev(dev)* +and *invalidate_buffers(dev)*. + + +.. _cdrom_ioctl: + +:: + + int cdrom_ioctl(struct inode *ip, struct file *fp, + unsigned int cmd, unsigned long arg) + +This function handles all the standard *ioctl* requests for CD-ROM +devices in a uniform way. The different calls fall into three +categories: *ioctl()'s* that can be directly implemented by device +operations, ones that are routed through the call *audio_ioctl()*, and +the remaining ones, that are presumable device-dependent. Generally, a +negative return value indicates an error. + +Directly implemented *ioctl()'s* +-------------------------------- + +The following `old` CD-ROM *ioctl()*\ 's are implemented by directly +calling device-operations in *cdrom_device_ops*, if implemented and +not masked: + +`CDROMMULTISESSION` + Requests the last session on a CD-ROM. +`CDROMEJECT` + Open tray. +`CDROMCLOSETRAY` + Close tray. +`CDROMEJECT_SW` + If *arg\not=0*, set behavior to auto-close (close + tray on first open) and auto-eject (eject on last release), otherwise + set behavior to non-moving on *open()* and *release()* calls. +`CDROM_GET_MCN` + Get the Media Catalog Number from a CD. + +*Ioctl*s routed through *audio_ioctl()* +--------------------------------------- + +The following set of *ioctl()'s* are all implemented through a call to +the *cdrom_fops* function *audio_ioctl()*. Memory checks and +allocation are performed in *cdrom_ioctl()*, and also sanitization of +address format (*CDROM_LBA*/*CDROM_MSF*) is done. + +`CDROMSUBCHNL` + Get sub-channel data in argument *arg* of type + `struct cdrom_subchnl *`. +`CDROMREADTOCHDR` + Read Table of Contents header, in *arg* of type + `struct cdrom_tochdr *`. +`CDROMREADTOCENTRY` + Read a Table of Contents entry in *arg* and specified by *arg* + of type `struct cdrom_tocentry *`. +`CDROMPLAYMSF` + Play audio fragment specified in Minute, Second, Frame format, + delimited by *arg* of type `struct cdrom_msf *`. +`CDROMPLAYTRKIND` + Play audio fragment in track-index format delimited by *arg* + of type `struct cdrom_ti *`. +`CDROMVOLCTRL` + Set volume specified by *arg* of type `struct cdrom_volctrl *`. +`CDROMVOLREAD` + Read volume into by *arg* of type `struct cdrom_volctrl *`. +`CDROMSTART` + Spin up disc. +`CDROMSTOP` + Stop playback of audio fragment. +`CDROMPAUSE` + Pause playback of audio fragment. +`CDROMRESUME` + Resume playing. + +New *ioctl()'s* in `cdrom.c` +---------------------------- + +The following *ioctl()'s* have been introduced to allow user programs to +control the behavior of individual CD-ROM devices. New *ioctl* +commands can be identified by the underscores in their names. + +`CDROM_SET_OPTIONS` + Set options specified by *arg*. Returns the option flag register + after modification. Use *arg = \rm0* for reading the current flags. +`CDROM_CLEAR_OPTIONS` + Clear options specified by *arg*. Returns the option flag register + after modification. +`CDROM_SELECT_SPEED` + Select head-rate speed of disc specified as by *arg* in units + of standard cdrom speed (176\,kB/sec raw data or + 150kB/sec file system data). The value 0 means `auto-select`, + i. e., play audio discs at real time and data discs at maximum speed. + The value *arg* is checked against the maximum head rate of the + drive found in the *cdrom_dops*. +`CDROM_SELECT_DISC` + Select disc numbered *arg* from a juke-box. + + First disc is numbered 0. The number *arg* is checked against the + maximum number of discs in the juke-box found in the *cdrom_dops*. +`CDROM_MEDIA_CHANGED` + Returns 1 if a disc has been changed since the last call. + Note that calls to *cdrom_media_changed* by the VFS are treated + by an independent queue, so both mechanisms will detect a + media change once. For juke-boxes, an extra argument *arg* + specifies the slot for which the information is given. The special + value *CDSL_CURRENT* requests that information about the currently + selected slot be returned. +`CDROM_DRIVE_STATUS` + Returns the status of the drive by a call to + *drive_status()*. Return values are defined in cdrom_drive_status_. + Note that this call doesn't return information on the + current playing activity of the drive; this can be polled through + an *ioctl* call to *CDROMSUBCHNL*. For juke-boxes, an extra argument + *arg* specifies the slot for which (possibly limited) information is + given. The special value *CDSL_CURRENT* requests that information + about the currently selected slot be returned. +`CDROM_DISC_STATUS` + Returns the type of the disc currently in the drive. + It should be viewed as a complement to *CDROM_DRIVE_STATUS*. + This *ioctl* can provide *some* information about the current + disc that is inserted in the drive. This functionality used to be + implemented in the low level drivers, but is now carried out + entirely in Uniform CD-ROM Driver. + + The history of development of the CD's use as a carrier medium for + various digital information has lead to many different disc types. + This *ioctl* is useful only in the case that CDs have \emph {only + one} type of data on them. While this is often the case, it is + also very common for CDs to have some tracks with data, and some + tracks with audio. Because this is an existing interface, rather + than fixing this interface by changing the assumptions it was made + under, thereby breaking all user applications that use this + function, the Uniform CD-ROM Driver implements this *ioctl* as + follows: If the CD in question has audio tracks on it, and it has + absolutely no CD-I, XA, or data tracks on it, it will be reported + as *CDS_AUDIO*. If it has both audio and data tracks, it will + return *CDS_MIXED*. If there are no audio tracks on the disc, and + if the CD in question has any CD-I tracks on it, it will be + reported as *CDS_XA_2_2*. Failing that, if the CD in question + has any XA tracks on it, it will be reported as *CDS_XA_2_1*. + Finally, if the CD in question has any data tracks on it, + it will be reported as a data CD (*CDS_DATA_1*). + + This *ioctl* can return:: + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, or tray is opened */ + CDS_AUDIO /* Audio disc (2352 audio bytes/frame) */ + CDS_DATA_1 /* data disc, mode 1 (2048 user bytes/frame) */ + CDS_XA_2_1 /* mixed data (XA), mode 2, form 1 (2048 user bytes) */ + CDS_XA_2_2 /* mixed data (XA), mode 2, form 1 (2324 user bytes) */ + CDS_MIXED /* mixed audio/data disc */ + + For some information concerning frame layout of the various disc + types, see a recent version of `cdrom.h`. + +`CDROM_CHANGER_NSLOTS` + Returns the number of slots in a juke-box. +`CDROMRESET` + Reset the drive. +`CDROM_GET_CAPABILITY` + Returns the *capability* flags for the drive. Refer to section + cdrom_capabilities_ for more information on these flags. +`CDROM_LOCKDOOR` + Locks the door of the drive. `arg == 0` unlocks the door, + any other value locks it. +`CDROM_DEBUG` + Turns on debugging info. Only root is allowed to do this. + Same semantics as CDROM_LOCKDOOR. + + +Device dependent *ioctl()'s* +---------------------------- + +Finally, all other *ioctl()'s* are passed to the function *dev_ioctl()*, +if implemented. No memory allocation or verification is carried out. + +How to update your driver +========================= + +- Make a backup of your current driver. +- Get hold of the files `cdrom.c` and `cdrom.h`, they should be in + the directory tree that came with this documentation. +- Make sure you include `cdrom.h`. +- Change the 3rd argument of *register_blkdev* from `&_fops` + to `&cdrom_fops`. +- Just after that line, add the following to register with the Uniform + CD-ROM Driver:: + + register_cdrom(&_info);* + + Similarly, add a call to *unregister_cdrom()* at the appropriate place. +- Copy an example of the device-operations *struct* to your + source, e. g., from `cm206.c` *cm206_dops*, and change all + entries to names corresponding to your driver, or names you just + happen to like. If your driver doesn't support a certain function, + make the entry *NULL*. At the entry *capability* you should list all + capabilities your driver currently supports. If your driver + has a capability that is not listed, please send me a message. +- Copy the *cdrom_device_info* declaration from the same example + driver, and modify the entries according to your needs. If your + driver dynamically determines the capabilities of the hardware, this + structure should also be declared dynamically. +- Implement all functions in your `_dops` structure, + according to prototypes listed in `cdrom.h`, and specifications given + in cdrom_api_. Most likely you have already implemented + the code in a large part, and you will almost certainly need to adapt the + prototype and return values. +- Rename your `_ioctl()` function to *audio_ioctl* and + change the prototype a little. Remove entries listed in the first + part in cdrom_ioctl_, if your code was OK, these are + just calls to the routines you adapted in the previous step. +- You may remove all remaining memory checking code in the + *audio_ioctl()* function that deals with audio commands (these are + listed in the second part of cdrom_ioctl_. There is no + need for memory allocation either, so most *case*s in the *switch* + statement look similar to:: + + case CDROMREADTOCENTRY: + get_toc_entry\bigl((struct cdrom_tocentry *) arg); + +- All remaining *ioctl* cases must be moved to a separate + function, *_ioctl*, the device-dependent *ioctl()'s*. Note that + memory checking and allocation must be kept in this code! +- Change the prototypes of *_open()* and + *_release()*, and remove any strategic code (i. e., tray + movement, door locking, etc.). +- Try to recompile the drivers. We advise you to use modules, both + for `cdrom.o` and your driver, as debugging is much easier this + way. + +Thanks +====== + +Thanks to all the people involved. First, Erik Andersen, who has +taken over the torch in maintaining `cdrom.c` and integrating much +CD-ROM-related code in the 2.1-kernel. Thanks to Scott Snyder and +Gerd Knorr, who were the first to implement this interface for SCSI +and IDE-CD drivers and added many ideas for extension of the data +structures relative to kernel~2.0. Further thanks to Heiko Eißfeldt, +Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard Mönkeberg and Andrew Kroll, +the Linux CD-ROM device driver developers who were kind +enough to give suggestions and criticisms during the writing. Finally +of course, I want to thank Linus Torvalds for making this possible in +the first place. diff --git a/Documentation/cdrom/cdrom-standard.txt b/Documentation/cdrom/cdrom-standard.txt deleted file mode 100644 index dde4f7f7fdbf..000000000000 --- a/Documentation/cdrom/cdrom-standard.txt +++ /dev/null @@ -1,1063 +0,0 @@ -======================= -A Linux CD-ROM standard -======================= - -:Author: David van Leeuwen -:Date: 12 March 1999 -:Updated by: Erik Andersen (andersee@debian.org) -:Updated by: Jens Axboe (axboe@image.dk) - - -Introduction -============ - -Linux is probably the Unix-like operating system that supports -the widest variety of hardware devices. The reasons for this are -presumably - -- The large list of hardware devices available for the many platforms - that Linux now supports (i.e., i386-PCs, Sparc Suns, etc.) -- The open design of the operating system, such that anybody can write a - driver for Linux. -- There is plenty of source code around as examples of how to write a driver. - -The openness of Linux, and the many different types of available -hardware has allowed Linux to support many different hardware devices. -Unfortunately, the very openness that has allowed Linux to support -all these different devices has also allowed the behavior of each -device driver to differ significantly from one device to another. -This divergence of behavior has been very significant for CD-ROM -devices; the way a particular drive reacts to a `standard` *ioctl()* -call varies greatly from one device driver to another. To avoid making -their drivers totally inconsistent, the writers of Linux CD-ROM -drivers generally created new device drivers by understanding, copying, -and then changing an existing one. Unfortunately, this practice did not -maintain uniform behavior across all the Linux CD-ROM drivers. - -This document describes an effort to establish Uniform behavior across -all the different CD-ROM device drivers for Linux. This document also -defines the various *ioctl()'s*, and how the low-level CD-ROM device -drivers should implement them. Currently (as of the Linux 2.1.\ *x* -development kernels) several low-level CD-ROM device drivers, including -both IDE/ATAPI and SCSI, now use this Uniform interface. - -When the CD-ROM was developed, the interface between the CD-ROM drive -and the computer was not specified in the standards. As a result, many -different CD-ROM interfaces were developed. Some of them had their -own proprietary design (Sony, Mitsumi, Panasonic, Philips), other -manufacturers adopted an existing electrical interface and changed -the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply -adapted their drives to one or more of the already existing electrical -interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and -most of the `NoName` manufacturers). In cases where a new drive really -brought its own interface or used its own command set and flow control -scheme, either a separate driver had to be written, or an existing -driver had to be enhanced. History has delivered us CD-ROM support for -many of these different interfaces. Nowadays, almost all new CD-ROM -drives are either IDE/ATAPI or SCSI, and it is very unlikely that any -manufacturer will create a new interface. Even finding drives for the -old proprietary interfaces is getting difficult. - -When (in the 1.3.70's) I looked at the existing software interface, -which was expressed through `cdrom.h`, it appeared to be a rather wild -set of commands and data formats [#f1]_. It seemed that many -features of the software interface had been added to accommodate the -capabilities of a particular drive, in an *ad hoc* manner. More -importantly, it appeared that the behavior of the `standard` commands -was different for most of the different drivers: e. g., some drivers -close the tray if an *open()* call occurs when the tray is open, while -others do not. Some drivers lock the door upon opening the device, to -prevent an incoherent file system, but others don't, to allow software -ejection. Undoubtedly, the capabilities of the different drives vary, -but even when two drives have the same capability their drivers' -behavior was usually different. - -.. [#f1] - I cannot recollect what kernel version I looked at, then, - presumably 1.2.13 and 1.3.34 --- the latest kernel that I was - indirectly involved in. - -I decided to start a discussion on how to make all the Linux CD-ROM -drivers behave more uniformly. I began by contacting the developers of -the many CD-ROM drivers found in the Linux kernel. Their reactions -encouraged me to write the Uniform CD-ROM Driver which this document is -intended to describe. The implementation of the Uniform CD-ROM Driver is -in the file `cdrom.c`. This driver is intended to be an additional software -layer that sits on top of the low-level device drivers for each CD-ROM drive. -By adding this additional layer, it is possible to have all the different -CD-ROM devices behave **exactly** the same (insofar as the underlying -hardware will allow). - -The goal of the Uniform CD-ROM Driver is **not** to alienate driver developers -whohave not yet taken steps to support this effort. The goal of Uniform CD-ROM -Driver is simply to give people writing application programs for CD-ROM drives -**one** Linux CD-ROM interface with consistent behavior for all -CD-ROM devices. In addition, this also provides a consistent interface -between the low-level device driver code and the Linux kernel. Care -is taken that 100% compatibility exists with the data structures and -programmer's interface defined in `cdrom.h`. This guide was written to -help CD-ROM driver developers adapt their code to use the Uniform CD-ROM -Driver code defined in `cdrom.c`. - -Personally, I think that the most important hardware interfaces are -the IDE/ATAPI drives and, of course, the SCSI drives, but as prices -of hardware drop continuously, it is also likely that people may have -more than one CD-ROM drive, possibly of mixed types. It is important -that these drives behave in the same way. In December 1994, one of the -cheapest CD-ROM drives was a Philips cm206, a double-speed proprietary -drive. In the months that I was busy writing a Linux driver for it, -proprietary drives became obsolete and IDE/ATAPI drives became the -standard. At the time of the last update to this document (November -1997) it is becoming difficult to even **find** anything less than a -16 speed CD-ROM drive, and 24 speed drives are common. - -.. _cdrom_api: - -Standardizing through another software level -============================================ - -At the time this document was conceived, all drivers directly -implemented the CD-ROM *ioctl()* calls through their own routines. This -led to the danger of different drivers forgetting to do important things -like checking that the user was giving the driver valid data. More -importantly, this led to the divergence of behavior, which has already -been discussed. - -For this reason, the Uniform CD-ROM Driver was created to enforce consistent -CD-ROM drive behavior, and to provide a common set of services to the various -low-level CD-ROM device drivers. The Uniform CD-ROM Driver now provides another -software-level, that separates the *ioctl()* and *open()* implementation -from the actual hardware implementation. Note that this effort has -made few changes which will affect a user's application programs. The -greatest change involved moving the contents of the various low-level -CD-ROM drivers\' header files to the kernel's cdrom directory. This was -done to help ensure that the user is only presented with only one cdrom -interface, the interface defined in `cdrom.h`. - -CD-ROM drives are specific enough (i. e., different from other -block-devices such as floppy or hard disc drives), to define a set -of common **CD-ROM device operations**, *_dops*. -These operations are different from the classical block-device file -operations, *_fops*. - -The routines for the Uniform CD-ROM Driver interface level are implemented -in the file `cdrom.c`. In this file, the Uniform CD-ROM Driver interfaces -with the kernel as a block device by registering the following general -*struct file_operations*:: - - struct file_operations cdrom_fops = { - NULL, /∗ lseek ∗/ - block _read , /∗ read—general block-dev read ∗/ - block _write, /∗ write—general block-dev write ∗/ - NULL, /∗ readdir ∗/ - NULL, /∗ select ∗/ - cdrom_ioctl, /∗ ioctl ∗/ - NULL, /∗ mmap ∗/ - cdrom_open, /∗ open ∗/ - cdrom_release, /∗ release ∗/ - NULL, /∗ fsync ∗/ - NULL, /∗ fasync ∗/ - cdrom_media_changed, /∗ media change ∗/ - NULL /∗ revalidate ∗/ - }; - -Every active CD-ROM device shares this *struct*. The routines -declared above are all implemented in `cdrom.c`, since this file is the -place where the behavior of all CD-ROM-devices is defined and -standardized. The actual interface to the various types of CD-ROM -hardware is still performed by various low-level CD-ROM-device -drivers. These routines simply implement certain **capabilities** -that are common to all CD-ROM (and really, all removable-media -devices). - -Registration of a low-level CD-ROM device driver is now done through -the general routines in `cdrom.c`, not through the Virtual File System -(VFS) any more. The interface implemented in `cdrom.c` is carried out -through two general structures that contain information about the -capabilities of the driver, and the specific drives on which the -driver operates. The structures are: - -cdrom_device_ops - This structure contains information about the low-level driver for a - CD-ROM device. This structure is conceptually connected to the major - number of the device (although some drivers may have different - major numbers, as is the case for the IDE driver). - -cdrom_device_info - This structure contains information about a particular CD-ROM drive, - such as its device name, speed, etc. This structure is conceptually - connected to the minor number of the device. - -Registering a particular CD-ROM drive with the Uniform CD-ROM Driver -is done by the low-level device driver though a call to:: - - register_cdrom(struct cdrom_device_info * _info) - -The device information structure, *_info*, contains all the -information needed for the kernel to interface with the low-level -CD-ROM device driver. One of the most important entries in this -structure is a pointer to the *cdrom_device_ops* structure of the -low-level driver. - -The device operations structure, *cdrom_device_ops*, contains a list -of pointers to the functions which are implemented in the low-level -device driver. When `cdrom.c` accesses a CD-ROM device, it does it -through the functions in this structure. It is impossible to know all -the capabilities of future CD-ROM drives, so it is expected that this -list may need to be expanded from time to time as new technologies are -developed. For example, CD-R and CD-R/W drives are beginning to become -popular, and support will soon need to be added for them. For now, the -current *struct* is:: - - struct cdrom_device_ops { - int (*open)(struct cdrom_device_info *, int) - void (*release)(struct cdrom_device_info *); - int (*drive_status)(struct cdrom_device_info *, int); - unsigned int (*check_events)(struct cdrom_device_info *, - unsigned int, int); - int (*media_changed)(struct cdrom_device_info *, int); - int (*tray_move)(struct cdrom_device_info *, int); - int (*lock_door)(struct cdrom_device_info *, int); - int (*select_speed)(struct cdrom_device_info *, int); - int (*select_disc)(struct cdrom_device_info *, int); - int (*get_last_session) (struct cdrom_device_info *, - struct cdrom_multisession *); - int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); - int (*reset)(struct cdrom_device_info *); - int (*audio_ioctl)(struct cdrom_device_info *, - unsigned int, void *); - const int capability; /* capability flags */ - int (*generic_packet)(struct cdrom_device_info *, - struct packet_command *); - }; - -When a low-level device driver implements one of these capabilities, -it should add a function pointer to this *struct*. When a particular -function is not implemented, however, this *struct* should contain a -NULL instead. The *capability* flags specify the capabilities of the -CD-ROM hardware and/or low-level CD-ROM driver when a CD-ROM drive -is registered with the Uniform CD-ROM Driver. - -Note that most functions have fewer parameters than their -*blkdev_fops* counterparts. This is because very little of the -information in the structures *inode* and *file* is used. For most -drivers, the main parameter is the *struct* *cdrom_device_info*, from -which the major and minor number can be extracted. (Most low-level -CD-ROM drivers don't even look at the major and minor number though, -since many of them only support one device.) This will be available -through *dev* in *cdrom_device_info* described below. - -The drive-specific, minor-like information that is registered with -`cdrom.c`, currently contains the following fields:: - - struct cdrom_device_info { - const struct cdrom_device_ops * ops; /* device operations for this major */ - struct list_head list; /* linked list of all device_info */ - struct gendisk * disk; /* matching block layer disk */ - void * handle; /* driver-dependent data */ - - int mask; /* mask of capability: disables them */ - int speed; /* maximum speed for reading data */ - int capacity; /* number of discs in a jukebox */ - - unsigned int options:30; /* options flags */ - unsigned mc_flags:2; /* media-change buffer flags */ - unsigned int vfs_events; /* cached events for vfs path */ - unsigned int ioctl_events; /* cached events for ioctl path */ - int use_count; /* number of times device is opened */ - char name[20]; /* name of the device type */ - - __u8 sanyo_slot : 2; /* Sanyo 3-CD changer support */ - __u8 keeplocked : 1; /* CDROM_LOCKDOOR status */ - __u8 reserved : 5; /* not used yet */ - int cdda_method; /* see CDDA_* flags */ - __u8 last_sense; /* saves last sense key */ - __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ - unsigned short mmc3_profile; /* current MMC3 profile */ - int for_data; /* unknown:TBD */ - int (*exit)(struct cdrom_device_info *);/* unknown:TBD */ - int mrw_mode_page; /* which MRW mode page is in use */ - }; - -Using this *struct*, a linked list of the registered minor devices is -built, using the *next* field. The device number, the device operations -struct and specifications of properties of the drive are stored in this -structure. - -The *mask* flags can be used to mask out some of the capabilities listed -in *ops->capability*, if a specific drive doesn't support a feature -of the driver. The value *speed* specifies the maximum head-rate of the -drive, measured in units of normal audio speed (176kB/sec raw data or -150kB/sec file system data). The parameters are declared *const* -because they describe properties of the drive, which don't change after -registration. - -A few registers contain variables local to the CD-ROM drive. The -flags *options* are used to specify how the general CD-ROM routines -should behave. These various flags registers should provide enough -flexibility to adapt to the different users' wishes (and **not** the -`arbitrary` wishes of the author of the low-level device driver, as is -the case in the old scheme). The register *mc_flags* is used to buffer -the information from *media_changed()* to two separate queues. Other -data that is specific to a minor drive, can be accessed through *handle*, -which can point to a data structure specific to the low-level driver. -The fields *use_count*, *next*, *options* and *mc_flags* need not be -initialized. - -The intermediate software layer that `cdrom.c` forms will perform some -additional bookkeeping. The use count of the device (the number of -processes that have the device opened) is registered in *use_count*. The -function *cdrom_ioctl()* will verify the appropriate user-memory regions -for read and write, and in case a location on the CD is transferred, -it will `sanitize` the format by making requests to the low-level -drivers in a standard format, and translating all formats between the -user-software and low level drivers. This relieves much of the drivers' -memory checking and format checking and translation. Also, the necessary -structures will be declared on the program stack. - -The implementation of the functions should be as defined in the -following sections. Two functions **must** be implemented, namely -*open()* and *release()*. Other functions may be omitted, their -corresponding capability flags will be cleared upon registration. -Generally, a function returns zero on success and negative on error. A -function call should return only after the command has completed, but of -course waiting for the device should not use processor time. - -:: - - int open(struct cdrom_device_info *cdi, int purpose) - -*Open()* should try to open the device for a specific *purpose*, which -can be either: - -- Open for reading data, as done by `mount()` (2), or the - user commands `dd` or `cat`. -- Open for *ioctl* commands, as done by audio-CD playing programs. - -Notice that any strategic code (closing tray upon *open()*, etc.) is -done by the calling routine in `cdrom.c`, so the low-level routine -should only be concerned with proper initialization, such as spinning -up the disc, etc. - -:: - - void release(struct cdrom_device_info *cdi) - -Device-specific actions should be taken such as spinning down the device. -However, strategic actions such as ejection of the tray, or unlocking -the door, should be left over to the general routine *cdrom_release()*. -This is the only function returning type *void*. - -.. _cdrom_drive_status: - -:: - - int drive_status(struct cdrom_device_info *cdi, int slot_nr) - -The function *drive_status*, if implemented, should provide -information on the status of the drive (not the status of the disc, -which may or may not be in the drive). If the drive is not a changer, -*slot_nr* should be ignored. In `cdrom.h` the possibilities are listed:: - - - CDS_NO_INFO /* no information available */ - CDS_NO_DISC /* no disc is inserted, tray is closed */ - CDS_TRAY_OPEN /* tray is opened */ - CDS_DRIVE_NOT_READY /* something is wrong, tray is moving? */ - CDS_DISC_OK /* a disc is loaded and everything is fine */ - -:: - - int media_changed(struct cdrom_device_info *cdi, int disc_nr) - -This function is very similar to the original function in $struct -file_operations*. It returns 1 if the medium of the device *cdi->dev* -has changed since the last call, and 0 otherwise. The parameter -*disc_nr* identifies a specific slot in a juke-box, it should be -ignored for single-disc drives. Note that by `re-routing` this -function through *cdrom_media_changed()*, we can implement separate -queues for the VFS and a new *ioctl()* function that can report device -changes to software (e. g., an auto-mounting daemon). - -:: - - int tray_move(struct cdrom_device_info *cdi, int position) - -This function, if implemented, should control the tray movement. (No -other function should control this.) The parameter *position* controls -the desired direction of movement: - -- 0 Close tray -- 1 Open tray - -This function returns 0 upon success, and a non-zero value upon -error. Note that if the tray is already in the desired position, no -action need be taken, and the return value should be 0. - -:: - - int lock_door(struct cdrom_device_info *cdi, int lock) - -This function (and no other code) controls locking of the door, if the -drive allows this. The value of *lock* controls the desired locking -state: - -- 0 Unlock door, manual opening is allowed -- 1 Lock door, tray cannot be ejected manually - -This function returns 0 upon success, and a non-zero value upon -error. Note that if the door is already in the requested state, no -action need be taken, and the return value should be 0. - -:: - - int select_speed(struct cdrom_device_info *cdi, int speed) - -Some CD-ROM drives are capable of changing their head-speed. There -are several reasons for changing the speed of a CD-ROM drive. Badly -pressed CD-ROM s may benefit from less-than-maximum head rate. Modern -CD-ROM drives can obtain very high head rates (up to *24x* is -common). It has been reported that these drives can make reading -errors at these high speeds, reducing the speed can prevent data loss -in these circumstances. Finally, some of these drives can -make an annoyingly loud noise, which a lower speed may reduce. - -This function specifies the speed at which data is read or audio is -played back. The value of *speed* specifies the head-speed of the -drive, measured in units of standard cdrom speed (176kB/sec raw data -or 150kB/sec file system data). So to request that a CD-ROM drive -operate at 300kB/sec you would call the CDROM_SELECT_SPEED *ioctl* -with *speed=2*. The special value `0` means `auto-selection`, i. e., -maximum data-rate or real-time audio rate. If the drive doesn't have -this `auto-selection` capability, the decision should be made on the -current disc loaded and the return value should be positive. A negative -return value indicates an error. - -:: - - int select_disc(struct cdrom_device_info *cdi, int number) - -If the drive can store multiple discs (a juke-box) this function -will perform disc selection. It should return the number of the -selected disc on success, a negative value on error. Currently, only -the ide-cd driver supports this functionality. - -:: - - int get_last_session(struct cdrom_device_info *cdi, - struct cdrom_multisession *ms_info) - -This function should implement the old corresponding *ioctl()*. For -device *cdi->dev*, the start of the last session of the current disc -should be returned in the pointer argument *ms_info*. Note that -routines in `cdrom.c` have sanitized this argument: its requested -format will **always** be of the type *CDROM_LBA* (linear block -addressing mode), whatever the calling software requested. But -sanitization goes even further: the low-level implementation may -return the requested information in *CDROM_MSF* format if it wishes so -(setting the *ms_info->addr_format* field appropriately, of -course) and the routines in `cdrom.c` will make the transformation if -necessary. The return value is 0 upon success. - -:: - - int get_mcn(struct cdrom_device_info *cdi, - struct cdrom_mcn *mcn) - -Some discs carry a `Media Catalog Number` (MCN), also called -`Universal Product Code` (UPC). This number should reflect the number -that is generally found in the bar-code on the product. Unfortunately, -the few discs that carry such a number on the disc don't even use the -same format. The return argument to this function is a pointer to a -pre-declared memory region of type *struct cdrom_mcn*. The MCN is -expected as a 13-character string, terminated by a null-character. - -:: - - int reset(struct cdrom_device_info *cdi) - -This call should perform a hard-reset on the drive (although in -circumstances that a hard-reset is necessary, a drive may very well not -listen to commands anymore). Preferably, control is returned to the -caller only after the drive has finished resetting. If the drive is no -longer listening, it may be wise for the underlying low-level cdrom -driver to time out. - -:: - - int audio_ioctl(struct cdrom_device_info *cdi, - unsigned int cmd, void *arg) - -Some of the CD-ROM-\ *ioctl()*\ 's defined in `cdrom.h` can be -implemented by the routines described above, and hence the function -*cdrom_ioctl* will use those. However, most *ioctl()*\ 's deal with -audio-control. We have decided to leave these to be accessed through a -single function, repeating the arguments *cmd* and *arg*. Note that -the latter is of type *void*, rather than *unsigned long int*. -The routine *cdrom_ioctl()* does do some useful things, -though. It sanitizes the address format type to *CDROM_MSF* (Minutes, -Seconds, Frames) for all audio calls. It also verifies the memory -location of *arg*, and reserves stack-memory for the argument. This -makes implementation of the *audio_ioctl()* much simpler than in the -old driver scheme. For example, you may look up the function -*cm206_audio_ioctl()* `cm206.c` that should be updated with -this documentation. - -An unimplemented ioctl should return *-ENOSYS*, but a harmless request -(e. g., *CDROMSTART*) may be ignored by returning 0 (success). Other -errors should be according to the standards, whatever they are. When -an error is returned by the low-level driver, the Uniform CD-ROM Driver -tries whenever possible to return the error code to the calling program. -(We may decide to sanitize the return value in *cdrom_ioctl()* though, in -order to guarantee a uniform interface to the audio-player software.) - -:: - - int dev_ioctl(struct cdrom_device_info *cdi, - unsigned int cmd, unsigned long arg) - -Some *ioctl()'s* seem to be specific to certain CD-ROM drives. That is, -they are introduced to service some capabilities of certain drives. In -fact, there are 6 different *ioctl()'s* for reading data, either in some -particular kind of format, or audio data. Not many drives support -reading audio tracks as data, I believe this is because of protection -of copyrights of artists. Moreover, I think that if audio-tracks are -supported, it should be done through the VFS and not via *ioctl()'s*. A -problem here could be the fact that audio-frames are 2352 bytes long, -so either the audio-file-system should ask for 75264 bytes at once -(the least common multiple of 512 and 2352), or the drivers should -bend their backs to cope with this incoherence (to which I would be -opposed). Furthermore, it is very difficult for the hardware to find -the exact frame boundaries, since there are no synchronization headers -in audio frames. Once these issues are resolved, this code should be -standardized in `cdrom.c`. - -Because there are so many *ioctl()'s* that seem to be introduced to -satisfy certain drivers [#f2]_, any non-standard *ioctl()*\ s -are routed through the call *dev_ioctl()*. In principle, `private` -*ioctl()*\ 's should be numbered after the device's major number, and not -the general CD-ROM *ioctl* number, `0x53`. Currently the -non-supported *ioctl()'s* are: - - CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW, - CDROMREADCOOKED, CDROMSEEK, CDROMPLAY-BLK and CDROM-READALL - -.. [#f2] - - Is there software around that actually uses these? I'd be interested! - -.. _cdrom_capabilities: - -CD-ROM capabilities -------------------- - -Instead of just implementing some *ioctl* calls, the interface in -`cdrom.c` supplies the possibility to indicate the **capabilities** -of a CD-ROM drive. This can be done by ORing any number of -capability-constants that are defined in `cdrom.h` at the registration -phase. Currently, the capabilities are any of:: - - CDC_CLOSE_TRAY /* can close tray by software control */ - CDC_OPEN_TRAY /* can open tray */ - CDC_LOCK /* can lock and unlock the door */ - CDC_SELECT_SPEED /* can select speed, in units of * sim*150 ,kB/s */ - CDC_SELECT_DISC /* drive is juke-box */ - CDC_MULTI_SESSION /* can read sessions *> rm1* */ - CDC_MCN /* can read Media Catalog Number */ - CDC_MEDIA_CHANGED /* can report if disc has changed */ - CDC_PLAY_AUDIO /* can perform audio-functions (play, pause, etc) */ - CDC_RESET /* hard reset device */ - CDC_IOCTLS /* driver has non-standard ioctls */ - CDC_DRIVE_STATUS /* driver implements drive status */ - -The capability flag is declared *const*, to prevent drivers from -accidentally tampering with the contents. The capability fags actually -inform `cdrom.c` of what the driver can do. If the drive found -by the driver does not have the capability, is can be masked out by -the *cdrom_device_info* variable *mask*. For instance, the SCSI CD-ROM -driver has implemented the code for loading and ejecting CD-ROM's, and -hence its corresponding flags in *capability* will be set. But a SCSI -CD-ROM drive might be a caddy system, which can't load the tray, and -hence for this drive the *cdrom_device_info* struct will have set -the *CDC_CLOSE_TRAY* bit in *mask*. - -In the file `cdrom.c` you will encounter many constructions of the type:: - - if (cdo->capability & ∼cdi->mask & CDC _⟨capability⟩) ... - -There is no *ioctl* to set the mask... The reason is that -I think it is better to control the **behavior** rather than the -**capabilities**. - -Options -------- - -A final flag register controls the **behavior** of the CD-ROM -drives, in order to satisfy different users' wishes, hopefully -independently of the ideas of the respective author who happened to -have made the drive's support available to the Linux community. The -current behavior options are:: - - CDO_AUTO_CLOSE /* try to close tray upon device open() */ - CDO_AUTO_EJECT /* try to open tray on last device close() */ - CDO_USE_FFLAGS /* use file_pointer->f_flags to indicate purpose for open() */ - CDO_LOCK /* try to lock door if device is opened */ - CDO_CHECK_TYPE /* ensure disc type is data if opened for data */ - -The initial value of this register is -`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`, reflecting my own view on user -interface and software standards. Before you protest, there are two -new *ioctl()'s* implemented in `cdrom.c`, that allow you to control the -behavior by software. These are:: - - CDROM_SET_OPTIONS /* set options specified in (int)arg */ - CDROM_CLEAR_OPTIONS /* clear options specified in (int)arg */ - -One option needs some more explanation: *CDO_USE_FFLAGS*. In the next -newsection we explain what the need for this option is. - -A software package `setcd`, available from the Debian distribution -and `sunsite.unc.edu`, allows user level control of these flags. - - -The need to know the purpose of opening the CD-ROM device -========================================================= - -Traditionally, Unix devices can be used in two different `modes`, -either by reading/writing to the device file, or by issuing -controlling commands to the device, by the device's *ioctl()* -call. The problem with CD-ROM drives, is that they can be used for -two entirely different purposes. One is to mount removable -file systems, CD-ROM's, the other is to play audio CD's. Audio commands -are implemented entirely through *ioctl()\'s*, presumably because the -first implementation (SUN?) has been such. In principle there is -nothing wrong with this, but a good control of the `CD player` demands -that the device can **always** be opened in order to give the -*ioctl* commands, regardless of the state the drive is in. - -On the other hand, when used as a removable-media disc drive (what the -original purpose of CD-ROM s is) we would like to make sure that the -disc drive is ready for operation upon opening the device. In the old -scheme, some CD-ROM drivers don't do any integrity checking, resulting -in a number of i/o errors reported by the VFS to the kernel when an -attempt for mounting a CD-ROM on an empty drive occurs. This is not a -particularly elegant way to find out that there is no CD-ROM inserted; -it more-or-less looks like the old IBM-PC trying to read an empty floppy -drive for a couple of seconds, after which the system complains it -can't read from it. Nowadays we can **sense** the existence of a -removable medium in a drive, and we believe we should exploit that -fact. An integrity check on opening of the device, that verifies the -availability of a CD-ROM and its correct type (data), would be -desirable. - -These two ways of using a CD-ROM drive, principally for data and -secondarily for playing audio discs, have different demands for the -behavior of the *open()* call. Audio use simply wants to open the -device in order to get a file handle which is needed for issuing -*ioctl* commands, while data use wants to open for correct and -reliable data transfer. The only way user programs can indicate what -their *purpose* of opening the device is, is through the *flags* -parameter (see `open(2)`). For CD-ROM devices, these flags aren't -implemented (some drivers implement checking for write-related flags, -but this is not strictly necessary if the device file has correct -permission flags). Most option flags simply don't make sense to -CD-ROM devices: *O_CREAT*, *O_NOCTTY*, *O_TRUNC*, *O_APPEND*, and -*O_SYNC* have no meaning to a CD-ROM. - -We therefore propose to use the flag *O_NONBLOCK* to indicate -that the device is opened just for issuing *ioctl* -commands. Strictly, the meaning of *O_NONBLOCK* is that opening and -subsequent calls to the device don't cause the calling process to -wait. We could interpret this as don't wait until someone has -inserted some valid data-CD-ROM. Thus, our proposal of the -implementation for the *open()* call for CD-ROM s is: - -- If no other flags are set than *O_RDONLY*, the device is opened - for data transfer, and the return value will be 0 only upon successful - initialization of the transfer. The call may even induce some actions - on the CD-ROM, such as closing the tray. -- If the option flag *O_NONBLOCK* is set, opening will always be - successful, unless the whole device doesn't exist. The drive will take - no actions whatsoever. - -And what about standards? -------------------------- - -You might hesitate to accept this proposal as it comes from the -Linux community, and not from some standardizing institute. What -about SUN, SGI, HP and all those other Unix and hardware vendors? -Well, these companies are in the lucky position that they generally -control both the hardware and software of their supported products, -and are large enough to set their own standard. They do not have to -deal with a dozen or more different, competing hardware -configurations\ [#f3]_. - -.. [#f3] - - Incidentally, I think that SUN's approach to mounting CD-ROM s is very - good in origin: under Solaris a volume-daemon automatically mounts a - newly inserted CD-ROM under `/cdrom/**`. - - In my opinion they should have pushed this - further and have **every** CD-ROM on the local area network be - mounted at the similar location, i. e., no matter in which particular - machine you insert a CD-ROM, it will always appear at the same - position in the directory tree, on every system. When I wanted to - implement such a user-program for Linux, I came across the - differences in behavior of the various drivers, and the need for an - *ioctl* informing about media changes. - -We believe that using *O_NONBLOCK* to indicate that a device is being opened -for *ioctl* commands only can be easily introduced in the Linux -community. All the CD-player authors will have to be informed, we can -even send in our own patches to the programs. The use of *O_NONBLOCK* -has most likely no influence on the behavior of the CD-players on -other operating systems than Linux. Finally, a user can always revert -to old behavior by a call to -*ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, CDO_USE_FFLAGS)*. - -The preferred strategy of *open()* ----------------------------------- - -The routines in `cdrom.c` are designed in such a way that run-time -configuration of the behavior of CD-ROM devices (of **any** type) -can be carried out, by the *CDROM_SET/CLEAR_OPTIONS* *ioctls*. Thus, various -modes of operation can be set: - -`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK` - This is the default setting. (With *CDO_CHECK_TYPE* it will be better, in - the future.) If the device is not yet opened by any other process, and if - the device is being opened for data (*O_NONBLOCK* is not set) and the - tray is found to be open, an attempt to close the tray is made. Then, - it is verified that a disc is in the drive and, if *CDO_CHECK_TYPE* is - set, that it contains tracks of type `data mode 1`. Only if all tests - are passed is the return value zero. The door is locked to prevent file - system corruption. If the drive is opened for audio (*O_NONBLOCK* is - set), no actions are taken and a value of 0 will be returned. - -`CDO_AUTO_CLOSE | CDO_AUTO_EJECT | CDO_LOCK` - This mimics the behavior of the current sbpcd-driver. The option flags are - ignored, the tray is closed on the first open, if necessary. Similarly, - the tray is opened on the last release, i. e., if a CD-ROM is unmounted, - it is automatically ejected, such that the user can replace it. - -We hope that these option can convince everybody (both driver -maintainers and user program developers) to adopt the new CD-ROM -driver scheme and option flag interpretation. - -Description of routines in `cdrom.c` -==================================== - -Only a few routines in `cdrom.c` are exported to the drivers. In this -new section we will discuss these, as well as the functions that `take -over' the CD-ROM interface to the kernel. The header file belonging -to `cdrom.c` is called `cdrom.h`. Formerly, some of the contents of this -file were placed in the file `ucdrom.h`, but this file has now been -merged back into `cdrom.h`. - -:: - - struct file_operations cdrom_fops - -The contents of this structure were described in cdrom_api_. -A pointer to this structure is assigned to the *fops* field -of the *struct gendisk*. - -:: - - int register_cdrom(struct cdrom_device_info *cdi) - -This function is used in about the same way one registers *cdrom_fops* -with the kernel, the device operations and information structures, -as described in cdrom_api_, should be registered with the -Uniform CD-ROM Driver:: - - register_cdrom(&_info); - - -This function returns zero upon success, and non-zero upon -failure. The structure *_info* should have a pointer to the -driver's *_dops*, as in:: - - struct cdrom_device_info _info = { - _dops; - ... - } - -Note that a driver must have one static structure, *_dops*, while -it may have as many structures *_info* as there are minor devices -active. *Register_cdrom()* builds a linked list from these. - - -:: - - void unregister_cdrom(struct cdrom_device_info *cdi) - -Unregistering device *cdi* with minor number *MINOR(cdi->dev)* removes -the minor device from the list. If it was the last registered minor for -the low-level driver, this disconnects the registered device-operation -routines from the CD-ROM interface. This function returns zero upon -success, and non-zero upon failure. - -:: - - int cdrom_open(struct inode * ip, struct file * fp) - -This function is not called directly by the low-level drivers, it is -listed in the standard *cdrom_fops*. If the VFS opens a file, this -function becomes active. A strategy is implemented in this routine, -taking care of all capabilities and options that are set in the -*cdrom_device_ops* connected to the device. Then, the program flow is -transferred to the device_dependent *open()* call. - -:: - - void cdrom_release(struct inode *ip, struct file *fp) - -This function implements the reverse-logic of *cdrom_open()*, and then -calls the device-dependent *release()* routine. When the use-count has -reached 0, the allocated buffers are flushed by calls to *sync_dev(dev)* -and *invalidate_buffers(dev)*. - - -.. _cdrom_ioctl: - -:: - - int cdrom_ioctl(struct inode *ip, struct file *fp, - unsigned int cmd, unsigned long arg) - -This function handles all the standard *ioctl* requests for CD-ROM -devices in a uniform way. The different calls fall into three -categories: *ioctl()'s* that can be directly implemented by device -operations, ones that are routed through the call *audio_ioctl()*, and -the remaining ones, that are presumable device-dependent. Generally, a -negative return value indicates an error. - -Directly implemented *ioctl()'s* --------------------------------- - -The following `old` CD-ROM *ioctl()*\ 's are implemented by directly -calling device-operations in *cdrom_device_ops*, if implemented and -not masked: - -`CDROMMULTISESSION` - Requests the last session on a CD-ROM. -`CDROMEJECT` - Open tray. -`CDROMCLOSETRAY` - Close tray. -`CDROMEJECT_SW` - If *arg\not=0*, set behavior to auto-close (close - tray on first open) and auto-eject (eject on last release), otherwise - set behavior to non-moving on *open()* and *release()* calls. -`CDROM_GET_MCN` - Get the Media Catalog Number from a CD. - -*Ioctl*s routed through *audio_ioctl()* ---------------------------------------- - -The following set of *ioctl()'s* are all implemented through a call to -the *cdrom_fops* function *audio_ioctl()*. Memory checks and -allocation are performed in *cdrom_ioctl()*, and also sanitization of -address format (*CDROM_LBA*/*CDROM_MSF*) is done. - -`CDROMSUBCHNL` - Get sub-channel data in argument *arg* of type - `struct cdrom_subchnl *`. -`CDROMREADTOCHDR` - Read Table of Contents header, in *arg* of type - `struct cdrom_tochdr *`. -`CDROMREADTOCENTRY` - Read a Table of Contents entry in *arg* and specified by *arg* - of type `struct cdrom_tocentry *`. -`CDROMPLAYMSF` - Play audio fragment specified in Minute, Second, Frame format, - delimited by *arg* of type `struct cdrom_msf *`. -`CDROMPLAYTRKIND` - Play audio fragment in track-index format delimited by *arg* - of type `struct cdrom_ti *`. -`CDROMVOLCTRL` - Set volume specified by *arg* of type `struct cdrom_volctrl *`. -`CDROMVOLREAD` - Read volume into by *arg* of type `struct cdrom_volctrl *`. -`CDROMSTART` - Spin up disc. -`CDROMSTOP` - Stop playback of audio fragment. -`CDROMPAUSE` - Pause playback of audio fragment. -`CDROMRESUME` - Resume playing. - -New *ioctl()'s* in `cdrom.c` ----------------------------- - -The following *ioctl()'s* have been introduced to allow user programs to -control the behavior of individual CD-ROM devices. New *ioctl* -commands can be identified by the underscores in their names. - -`CDROM_SET_OPTIONS` - Set options specified by *arg*. Returns the option flag register - after modification. Use *arg = \rm0* for reading the current flags. -`CDROM_CLEAR_OPTIONS` - Clear options specified by *arg*. Returns the option flag register - after modification. -`CDROM_SELECT_SPEED` - Select head-rate speed of disc specified as by *arg* in units - of standard cdrom speed (176\,kB/sec raw data or - 150kB/sec file system data). The value 0 means `auto-select`, - i. e., play audio discs at real time and data discs at maximum speed. - The value *arg* is checked against the maximum head rate of the - drive found in the *cdrom_dops*. -`CDROM_SELECT_DISC` - Select disc numbered *arg* from a juke-box. - - First disc is numbered 0. The number *arg* is checked against the - maximum number of discs in the juke-box found in the *cdrom_dops*. -`CDROM_MEDIA_CHANGED` - Returns 1 if a disc has been changed since the last call. - Note that calls to *cdrom_media_changed* by the VFS are treated - by an independent queue, so both mechanisms will detect a - media change once. For juke-boxes, an extra argument *arg* - specifies the slot for which the information is given. The special - value *CDSL_CURRENT* requests that information about the currently - selected slot be returned. -`CDROM_DRIVE_STATUS` - Returns the status of the drive by a call to - *drive_status()*. Return values are defined in cdrom_drive_status_. - Note that this call doesn't return information on the - current playing activity of the drive; this can be polled through - an *ioctl* call to *CDROMSUBCHNL*. For juke-boxes, an extra argument - *arg* specifies the slot for which (possibly limited) information is - given. The special value *CDSL_CURRENT* requests that information - about the currently selected slot be returned. -`CDROM_DISC_STATUS` - Returns the type of the disc currently in the drive. - It should be viewed as a complement to *CDROM_DRIVE_STATUS*. - This *ioctl* can provide *some* information about the current - disc that is inserted in the drive. This functionality used to be - implemented in the low level drivers, but is now carried out - entirely in Uniform CD-ROM Driver. - - The history of development of the CD's use as a carrier medium for - various digital information has lead to many different disc types. - This *ioctl* is useful only in the case that CDs have \emph {only - one} type of data on them. While this is often the case, it is - also very common for CDs to have some tracks with data, and some - tracks with audio. Because this is an existing interface, rather - than fixing this interface by changing the assumptions it was made - under, thereby breaking all user applications that use this - function, the Uniform CD-ROM Driver implements this *ioctl* as - follows: If the CD in question has audio tracks on it, and it has - absolutely no CD-I, XA, or data tracks on it, it will be reported - as *CDS_AUDIO*. If it has both audio and data tracks, it will - return *CDS_MIXED*. If there are no audio tracks on the disc, and - if the CD in question has any CD-I tracks on it, it will be - reported as *CDS_XA_2_2*. Failing that, if the CD in question - has any XA tracks on it, it will be reported as *CDS_XA_2_1*. - Finally, if the CD in question has any data tracks on it, - it will be reported as a data CD (*CDS_DATA_1*). - - This *ioctl* can return:: - - CDS_NO_INFO /* no information available */ - CDS_NO_DISC /* no disc is inserted, or tray is opened */ - CDS_AUDIO /* Audio disc (2352 audio bytes/frame) */ - CDS_DATA_1 /* data disc, mode 1 (2048 user bytes/frame) */ - CDS_XA_2_1 /* mixed data (XA), mode 2, form 1 (2048 user bytes) */ - CDS_XA_2_2 /* mixed data (XA), mode 2, form 1 (2324 user bytes) */ - CDS_MIXED /* mixed audio/data disc */ - - For some information concerning frame layout of the various disc - types, see a recent version of `cdrom.h`. - -`CDROM_CHANGER_NSLOTS` - Returns the number of slots in a juke-box. -`CDROMRESET` - Reset the drive. -`CDROM_GET_CAPABILITY` - Returns the *capability* flags for the drive. Refer to section - cdrom_capabilities_ for more information on these flags. -`CDROM_LOCKDOOR` - Locks the door of the drive. `arg == 0` unlocks the door, - any other value locks it. -`CDROM_DEBUG` - Turns on debugging info. Only root is allowed to do this. - Same semantics as CDROM_LOCKDOOR. - - -Device dependent *ioctl()'s* ----------------------------- - -Finally, all other *ioctl()'s* are passed to the function *dev_ioctl()*, -if implemented. No memory allocation or verification is carried out. - -How to update your driver -========================= - -- Make a backup of your current driver. -- Get hold of the files `cdrom.c` and `cdrom.h`, they should be in - the directory tree that came with this documentation. -- Make sure you include `cdrom.h`. -- Change the 3rd argument of *register_blkdev* from `&_fops` - to `&cdrom_fops`. -- Just after that line, add the following to register with the Uniform - CD-ROM Driver:: - - register_cdrom(&_info);* - - Similarly, add a call to *unregister_cdrom()* at the appropriate place. -- Copy an example of the device-operations *struct* to your - source, e. g., from `cm206.c` *cm206_dops*, and change all - entries to names corresponding to your driver, or names you just - happen to like. If your driver doesn't support a certain function, - make the entry *NULL*. At the entry *capability* you should list all - capabilities your driver currently supports. If your driver - has a capability that is not listed, please send me a message. -- Copy the *cdrom_device_info* declaration from the same example - driver, and modify the entries according to your needs. If your - driver dynamically determines the capabilities of the hardware, this - structure should also be declared dynamically. -- Implement all functions in your `_dops` structure, - according to prototypes listed in `cdrom.h`, and specifications given - in cdrom_api_. Most likely you have already implemented - the code in a large part, and you will almost certainly need to adapt the - prototype and return values. -- Rename your `_ioctl()` function to *audio_ioctl* and - change the prototype a little. Remove entries listed in the first - part in cdrom_ioctl_, if your code was OK, these are - just calls to the routines you adapted in the previous step. -- You may remove all remaining memory checking code in the - *audio_ioctl()* function that deals with audio commands (these are - listed in the second part of cdrom_ioctl_. There is no - need for memory allocation either, so most *case*s in the *switch* - statement look similar to:: - - case CDROMREADTOCENTRY: - get_toc_entry\bigl((struct cdrom_tocentry *) arg); - -- All remaining *ioctl* cases must be moved to a separate - function, *_ioctl*, the device-dependent *ioctl()'s*. Note that - memory checking and allocation must be kept in this code! -- Change the prototypes of *_open()* and - *_release()*, and remove any strategic code (i. e., tray - movement, door locking, etc.). -- Try to recompile the drivers. We advise you to use modules, both - for `cdrom.o` and your driver, as debugging is much easier this - way. - -Thanks -====== - -Thanks to all the people involved. First, Erik Andersen, who has -taken over the torch in maintaining `cdrom.c` and integrating much -CD-ROM-related code in the 2.1-kernel. Thanks to Scott Snyder and -Gerd Knorr, who were the first to implement this interface for SCSI -and IDE-CD drivers and added many ideas for extension of the data -structures relative to kernel~2.0. Further thanks to Heiko Eißfeldt, -Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard Mönkeberg and Andrew Kroll, -the Linux CD-ROM device driver developers who were kind -enough to give suggestions and criticisms during the writing. Finally -of course, I want to thank Linus Torvalds for making this possible in -the first place. diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd deleted file mode 100644 index a5f2a7f1ff46..000000000000 --- a/Documentation/cdrom/ide-cd +++ /dev/null @@ -1,534 +0,0 @@ -IDE-CD driver documentation -Originally by scott snyder (19 May 1996) -Carrying on the torch is: Erik Andersen -New maintainers (19 Oct 1998): Jens Axboe - -1. Introduction ---------------- - -The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant -CDROM drives which attach to an IDE interface. Note that some CDROM vendors -(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made -both ATAPI-compliant drives and drives which use a proprietary -interface. If your drive uses one of those proprietary interfaces, -this driver will not work with it (but one of the other CDROM drivers -probably will). This driver will not work with `ATAPI' drives which -attach to the parallel port. In addition, there is at least one drive -(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI; -this driver will not work with drives like that either (but see the -aztcd driver). - -This driver provides the following features: - - - Reading from data tracks, and mounting ISO 9660 filesystems. - - - Playing audio tracks. Most of the CDROM player programs floating - around should work; I usually use Workman. - - - Multisession support. - - - On drives which support it, reading digital audio data directly - from audio tracks. The program cdda2wav can be used for this. - Note, however, that only some drives actually support this. - - - There is now support for CDROM changers which comply with the - ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional - functionality includes a function call to query which slot is the - currently selected slot, a function call to query which slots contain - CDs, etc. A sample program which demonstrates this functionality is - appended to the end of this file. The Sanyo 3-disc changer - (which does not conform to the standard) is also now supported. - Please note the driver refers to the first CD as slot # 0. - - -2. Installation ---------------- - -0. The ide-cd relies on the ide disk driver. See - Documentation/ide/ide.txt for up-to-date information on the ide - driver. - -1. Make sure that the ide and ide-cd drivers are compiled into the - kernel you're using. When configuring the kernel, in the section - entitled "Floppy, IDE, and other block devices", say either `Y' - (which will compile the support directly into the kernel) or `M' - (to compile support as a module which can be loaded and unloaded) - to the options: - - ATA/ATAPI/MFM/RLL support - Include IDE/ATAPI CDROM support - - Depending on what type of IDE interface you have, you may need to - specify additional configuration options. See - Documentation/ide/ide.txt. - -2. You should also ensure that the iso9660 filesystem is either - compiled into the kernel or available as a loadable module. You - can see if a filesystem is known to the kernel by catting - /proc/filesystems. - -3. The CDROM drive should be connected to the host on an IDE - interface. Each interface on a system is defined by an I/O port - address and an IRQ number, the standard assignments being - 0x1f0 and 14 for the primary interface and 0x170 and 15 for the - secondary interface. Each interface can control up to two devices, - where each device can be a hard drive, a CDROM drive, a floppy drive, - or a tape drive. The two devices on an interface are called `master' - and `slave'; this is usually selectable via a jumper on the drive. - - Linux names these devices as follows. The master and slave devices - on the primary IDE interface are called `hda' and `hdb', - respectively. The drives on the secondary interface are called - `hdc' and `hdd'. (Interfaces at other locations get other letters - in the third position; see Documentation/ide/ide.txt.) - - If you want your CDROM drive to be found automatically by the - driver, you should make sure your IDE interface uses either the - primary or secondary addresses mentioned above. In addition, if - the CDROM drive is the only device on the IDE interface, it should - be jumpered as `master'. (If for some reason you cannot configure - your system in this manner, you can probably still use the driver. - You may have to pass extra configuration information to the kernel - when you boot, however. See Documentation/ide/ide.txt for more - information.) - -4. Boot the system. If the drive is recognized, you should see a - message which looks like - - hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive - - If you do not see this, see section 5 below. - -5. You may want to create a symbolic link /dev/cdrom pointing to the - actual device. You can do this with the command - - ln -s /dev/hdX /dev/cdrom - - where X should be replaced by the letter indicating where your - drive is installed. - -6. You should be able to see any error messages from the driver with - the `dmesg' command. - - -3. Basic usage --------------- - -An ISO 9660 CDROM can be mounted by putting the disc in the drive and -typing (as root) - - mount -t iso9660 /dev/cdrom /mnt/cdrom - -where it is assumed that /dev/cdrom is a link pointing to the actual -device (as described in step 5 of the last section) and /mnt/cdrom is -an empty directory. You should now be able to see the contents of the -CDROM under the /mnt/cdrom directory. If you want to eject the CDROM, -you must first dismount it with a command like - - umount /mnt/cdrom - -Note that audio CDs cannot be mounted. - -Some distributions set up /etc/fstab to always try to mount a CDROM -filesystem on bootup. It is not required to mount the CDROM in this -manner, though, and it may be a nuisance if you change CDROMs often. -You should feel free to remove the cdrom line from /etc/fstab and -mount CDROMs manually if that suits you better. - -Multisession and photocd discs should work with no special handling. -The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be -useful for reading photocds. - -To play an audio CD, you should first unmount and remove any data -CDROM. Any of the CDROM player programs should then work (workman, -workbone, cdplayer, etc.). - -On a few drives, you can read digital audio directly using a program -such as cdda2wav. The only types of drive which I've heard support -this are Sony and Toshiba drives. You will get errors if you try to -use this function on a drive which does not support it. - -For supported changers, you can use the `cdchange' program (appended to -the end of this file) to switch between changer slots. Note that the -drive should be unmounted before attempting this. The program takes -two arguments: the CDROM device, and the slot number to which you wish -to change. If the slot number is -1, the drive is unloaded. - - -4. Common problems ------------------- - -This section discusses some common problems encountered when trying to -use the driver, and some possible solutions. Note that if you are -experiencing problems, you should probably also review -Documentation/ide/ide.txt for current information about the underlying -IDE support code. Some of these items apply only to earlier versions -of the driver, but are mentioned here for completeness. - -In most cases, you should probably check with `dmesg' for any errors -from the driver. - -a. Drive is not detected during booting. - - - Review the configuration instructions above and in - Documentation/ide/ide.txt, and check how your hardware is - configured. - - - If your drive is the only device on an IDE interface, it should - be jumpered as master, if at all possible. - - - If your IDE interface is not at the standard addresses of 0x170 - or 0x1f0, you'll need to explicitly inform the driver using a - lilo option. See Documentation/ide/ide.txt. (This feature was - added around kernel version 1.3.30.) - - - If the autoprobing is not finding your drive, you can tell the - driver to assume that one exists by using a lilo option of the - form `hdX=cdrom', where X is the drive letter corresponding to - where your drive is installed. Note that if you do this and you - see a boot message like - - hdX: ATAPI cdrom (?) - - this does _not_ mean that the driver has successfully detected - the drive; rather, it means that the driver has not detected a - drive, but is assuming there's one there anyway because you told - it so. If you actually try to do I/O to a drive defined at a - nonexistent or nonresponding I/O address, you'll probably get - errors with a status value of 0xff. - - - Some IDE adapters require a nonstandard initialization sequence - before they'll function properly. (If this is the case, there - will often be a separate MS-DOS driver just for the controller.) - IDE interfaces on sound cards often fall into this category. - - Support for some interfaces needing extra initialization is - provided in later 1.3.x kernels. You may need to turn on - additional kernel configuration options to get them to work; - see Documentation/ide/ide.txt. - - Even if support is not available for your interface, you may be - able to get it to work with the following procedure. First boot - MS-DOS and load the appropriate drivers. Then warm-boot linux - (i.e., without powering off). If this works, it can be automated - by running loadlin from the MS-DOS autoexec. - - -b. Timeout/IRQ errors. - - - If you always get timeout errors, interrupts from the drive are - probably not making it to the host. - - - IRQ problems may also be indicated by the message - `IRQ probe failed ()' while booting. If is zero, that - means that the system did not see an interrupt from the drive when - it was expecting one (on any feasible IRQ). If is negative, - that means the system saw interrupts on multiple IRQ lines, when - it was expecting to receive just one from the CDROM drive. - - - Double-check your hardware configuration to make sure that the IRQ - number of your IDE interface matches what the driver expects. - (The usual assignments are 14 for the primary (0x1f0) interface - and 15 for the secondary (0x170) interface.) Also be sure that - you don't have some other hardware which might be conflicting with - the IRQ you're using. Also check the BIOS setup for your system; - some have the ability to disable individual IRQ levels, and I've - had one report of a system which was shipped with IRQ 15 disabled - by default. - - - Note that many MS-DOS CDROM drivers will still function even if - there are hardware problems with the interrupt setup; they - apparently don't use interrupts. - - - If you own a Pioneer DR-A24X, you _will_ get nasty error messages - on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }" - The Pioneer DR-A24X CDROM drives are fairly popular these days. - Unfortunately, these drives seem to become very confused when we perform - the standard Linux ATA disk drive probe. If you own one of these drives, - you can bypass the ATA probing which confuses these CDROM drives, by - adding `append="hdX=noprobe hdX=cdrom"' to your lilo.conf file and running - lilo (again where X is the drive letter corresponding to where your drive - is installed.) - -c. System hangups. - - - If the system locks up when you try to access the CDROM, the most - likely cause is that you have a buggy IDE adapter which doesn't - properly handle simultaneous transactions on multiple interfaces. - The most notorious of these is the CMD640B chip. This problem can - be worked around by specifying the `serialize' option when - booting. Recent kernels should be able to detect the need for - this automatically in most cases, but the detection is not - foolproof. See Documentation/ide/ide.txt for more information - about the `serialize' option and the CMD640B. - - - Note that many MS-DOS CDROM drivers will work with such buggy - hardware, apparently because they never attempt to overlap CDROM - operations with other disk activity. - - -d. Can't mount a CDROM. - - - If you get errors from mount, it may help to check `dmesg' to see - if there are any more specific errors from the driver or from the - filesystem. - - - Make sure there's a CDROM loaded in the drive, and that's it's an - ISO 9660 disc. You can't mount an audio CD. - - - With the CDROM in the drive and unmounted, try something like - - cat /dev/cdrom | od | more - - If you see a dump, then the drive and driver are probably working - OK, and the problem is at the filesystem level (i.e., the CDROM is - not ISO 9660 or has errors in the filesystem structure). - - - If you see `not a block device' errors, check that the definitions - of the device special files are correct. They should be as - follows: - - brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda - brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb - brw-rw---- 1 root disk 22, 0 Nov 11 18:48 /dev/hdc - brw-rw---- 1 root disk 22, 64 Nov 11 18:48 /dev/hdd - - Some early Slackware releases had these defined incorrectly. If - these are wrong, you can remake them by running the script - scripts/MAKEDEV.ide. (You may have to make it executable - with chmod first.) - - If you have a /dev/cdrom symbolic link, check that it is pointing - to the correct device file. - - If you hear people talking of the devices `hd1a' and `hd1b', these - were old names for what are now called hdc and hdd. Those names - should be considered obsolete. - - - If mount is complaining that the iso9660 filesystem is not - available, but you know it is (check /proc/filesystems), you - probably need a newer version of mount. Early versions would not - always give meaningful error messages. - - -e. Directory listings are unpredictably truncated, and `dmesg' shows - `buffer botch' error messages from the driver. - - - There was a bug in the version of the driver in 1.2.x kernels - which could cause this. It was fixed in 1.3.0. If you can't - upgrade, you can probably work around the problem by specifying a - blocksize of 2048 when mounting. (Note that you won't be able to - directly execute binaries off the CDROM in that case.) - - If you see this in kernels later than 1.3.0, please report it as a - bug. - - -f. Data corruption. - - - Random data corruption was occasionally observed with the Hitachi - CDR-7730 CDROM. If you experience data corruption, using "hdx=slow" - as a command line parameter may work around the problem, at the - expense of low system performance. - - -5. cdchange.c -------------- - -/* - * cdchange.c [-v] [] - * - * This loads a CDROM from a specified slot in a changer, and displays - * information about the changer status. The drive should be unmounted before - * using this program. - * - * Changer information is displayed if either the -v flag is specified - * or no slot was specified. - * - * Based on code originally from Gerhard Zuber . - * Changer status information, and rewrite for the new Uniform CDROM driver - * interface by Erik Andersen . - */ - -#include -#include -#include -#include -#include -#include -#include -#include - - -int -main (int argc, char **argv) -{ - char *program; - char *device; - int fd; /* file descriptor for CD-ROM device */ - int status; /* return status for system calls */ - int verbose = 0; - int slot=-1, x_slot; - int total_slots_available; - - program = argv[0]; - - ++argv; - --argc; - - if (argc < 1 || argc > 3) { - fprintf (stderr, "usage: %s [-v] []\n", - program); - fprintf (stderr, " Slots are numbered 1 -- n.\n"); - exit (1); - } - - if (strcmp (argv[0], "-v") == 0) { - verbose = 1; - ++argv; - --argc; - } - - device = argv[0]; - - if (argc == 2) - slot = atoi (argv[1]) - 1; - - /* open device */ - fd = open(device, O_RDONLY | O_NONBLOCK); - if (fd < 0) { - fprintf (stderr, "%s: open failed for `%s': %s\n", - program, device, strerror (errno)); - exit (1); - } - - /* Check CD player status */ - total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS); - if (total_slots_available <= 1 ) { - fprintf (stderr, "%s: Device `%s' is not an ATAPI " - "compliant CD changer.\n", program, device); - exit (1); - } - - if (slot >= 0) { - if (slot >= total_slots_available) { - fprintf (stderr, "Bad slot number. " - "Should be 1 -- %d.\n", - total_slots_available); - exit (1); - } - - /* load */ - slot=ioctl (fd, CDROM_SELECT_DISC, slot); - if (slot<0) { - fflush(stdout); - perror ("CDROM_SELECT_DISC "); - exit(1); - } - } - - if (slot < 0 || verbose) { - - status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT); - if (status<0) { - fflush(stdout); - perror (" CDROM_SELECT_DISC"); - exit(1); - } - slot=status; - - printf ("Current slot: %d\n", slot+1); - printf ("Total slots available: %d\n", - total_slots_available); - - printf ("Drive status: "); - status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); - if (status<0) { - perror(" CDROM_DRIVE_STATUS"); - } else switch(status) { - case CDS_DISC_OK: - printf ("Ready.\n"); - break; - case CDS_TRAY_OPEN: - printf ("Tray Open.\n"); - break; - case CDS_DRIVE_NOT_READY: - printf ("Drive Not Ready.\n"); - break; - default: - printf ("This Should not happen!\n"); - break; - } - - for (x_slot=0; x_slot (19 May 1996) +:Carrying on the torch is: Erik Andersen +:New maintainers (19 Oct 1998): Jens Axboe + +1. Introduction +--------------- + +The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant +CDROM drives which attach to an IDE interface. Note that some CDROM vendors +(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made +both ATAPI-compliant drives and drives which use a proprietary +interface. If your drive uses one of those proprietary interfaces, +this driver will not work with it (but one of the other CDROM drivers +probably will). This driver will not work with `ATAPI` drives which +attach to the parallel port. In addition, there is at least one drive +(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI; +this driver will not work with drives like that either (but see the +aztcd driver). + +This driver provides the following features: + + - Reading from data tracks, and mounting ISO 9660 filesystems. + + - Playing audio tracks. Most of the CDROM player programs floating + around should work; I usually use Workman. + + - Multisession support. + + - On drives which support it, reading digital audio data directly + from audio tracks. The program cdda2wav can be used for this. + Note, however, that only some drives actually support this. + + - There is now support for CDROM changers which comply with the + ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional + functionality includes a function call to query which slot is the + currently selected slot, a function call to query which slots contain + CDs, etc. A sample program which demonstrates this functionality is + appended to the end of this file. The Sanyo 3-disc changer + (which does not conform to the standard) is also now supported. + Please note the driver refers to the first CD as slot # 0. + + +2. Installation +--------------- + +0. The ide-cd relies on the ide disk driver. See + Documentation/ide/ide.txt for up-to-date information on the ide + driver. + +1. Make sure that the ide and ide-cd drivers are compiled into the + kernel you're using. When configuring the kernel, in the section + entitled "Floppy, IDE, and other block devices", say either `Y` + (which will compile the support directly into the kernel) or `M` + (to compile support as a module which can be loaded and unloaded) + to the options:: + + ATA/ATAPI/MFM/RLL support + Include IDE/ATAPI CDROM support + + Depending on what type of IDE interface you have, you may need to + specify additional configuration options. See + Documentation/ide/ide.txt. + +2. You should also ensure that the iso9660 filesystem is either + compiled into the kernel or available as a loadable module. You + can see if a filesystem is known to the kernel by catting + /proc/filesystems. + +3. The CDROM drive should be connected to the host on an IDE + interface. Each interface on a system is defined by an I/O port + address and an IRQ number, the standard assignments being + 0x1f0 and 14 for the primary interface and 0x170 and 15 for the + secondary interface. Each interface can control up to two devices, + where each device can be a hard drive, a CDROM drive, a floppy drive, + or a tape drive. The two devices on an interface are called `master` + and `slave`; this is usually selectable via a jumper on the drive. + + Linux names these devices as follows. The master and slave devices + on the primary IDE interface are called `hda` and `hdb`, + respectively. The drives on the secondary interface are called + `hdc` and `hdd`. (Interfaces at other locations get other letters + in the third position; see Documentation/ide/ide.txt.) + + If you want your CDROM drive to be found automatically by the + driver, you should make sure your IDE interface uses either the + primary or secondary addresses mentioned above. In addition, if + the CDROM drive is the only device on the IDE interface, it should + be jumpered as `master`. (If for some reason you cannot configure + your system in this manner, you can probably still use the driver. + You may have to pass extra configuration information to the kernel + when you boot, however. See Documentation/ide/ide.txt for more + information.) + +4. Boot the system. If the drive is recognized, you should see a + message which looks like:: + + hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive + + If you do not see this, see section 5 below. + +5. You may want to create a symbolic link /dev/cdrom pointing to the + actual device. You can do this with the command:: + + ln -s /dev/hdX /dev/cdrom + + where X should be replaced by the letter indicating where your + drive is installed. + +6. You should be able to see any error messages from the driver with + the `dmesg` command. + + +3. Basic usage +-------------- + +An ISO 9660 CDROM can be mounted by putting the disc in the drive and +typing (as root):: + + mount -t iso9660 /dev/cdrom /mnt/cdrom + +where it is assumed that /dev/cdrom is a link pointing to the actual +device (as described in step 5 of the last section) and /mnt/cdrom is +an empty directory. You should now be able to see the contents of the +CDROM under the /mnt/cdrom directory. If you want to eject the CDROM, +you must first dismount it with a command like:: + + umount /mnt/cdrom + +Note that audio CDs cannot be mounted. + +Some distributions set up /etc/fstab to always try to mount a CDROM +filesystem on bootup. It is not required to mount the CDROM in this +manner, though, and it may be a nuisance if you change CDROMs often. +You should feel free to remove the cdrom line from /etc/fstab and +mount CDROMs manually if that suits you better. + +Multisession and photocd discs should work with no special handling. +The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be +useful for reading photocds. + +To play an audio CD, you should first unmount and remove any data +CDROM. Any of the CDROM player programs should then work (workman, +workbone, cdplayer, etc.). + +On a few drives, you can read digital audio directly using a program +such as cdda2wav. The only types of drive which I've heard support +this are Sony and Toshiba drives. You will get errors if you try to +use this function on a drive which does not support it. + +For supported changers, you can use the `cdchange` program (appended to +the end of this file) to switch between changer slots. Note that the +drive should be unmounted before attempting this. The program takes +two arguments: the CDROM device, and the slot number to which you wish +to change. If the slot number is -1, the drive is unloaded. + + +4. Common problems +------------------ + +This section discusses some common problems encountered when trying to +use the driver, and some possible solutions. Note that if you are +experiencing problems, you should probably also review +Documentation/ide/ide.txt for current information about the underlying +IDE support code. Some of these items apply only to earlier versions +of the driver, but are mentioned here for completeness. + +In most cases, you should probably check with `dmesg` for any errors +from the driver. + +a. Drive is not detected during booting. + + - Review the configuration instructions above and in + Documentation/ide/ide.txt, and check how your hardware is + configured. + + - If your drive is the only device on an IDE interface, it should + be jumpered as master, if at all possible. + + - If your IDE interface is not at the standard addresses of 0x170 + or 0x1f0, you'll need to explicitly inform the driver using a + lilo option. See Documentation/ide/ide.txt. (This feature was + added around kernel version 1.3.30.) + + - If the autoprobing is not finding your drive, you can tell the + driver to assume that one exists by using a lilo option of the + form `hdX=cdrom`, where X is the drive letter corresponding to + where your drive is installed. Note that if you do this and you + see a boot message like:: + + hdX: ATAPI cdrom (?) + + this does _not_ mean that the driver has successfully detected + the drive; rather, it means that the driver has not detected a + drive, but is assuming there's one there anyway because you told + it so. If you actually try to do I/O to a drive defined at a + nonexistent or nonresponding I/O address, you'll probably get + errors with a status value of 0xff. + + - Some IDE adapters require a nonstandard initialization sequence + before they'll function properly. (If this is the case, there + will often be a separate MS-DOS driver just for the controller.) + IDE interfaces on sound cards often fall into this category. + + Support for some interfaces needing extra initialization is + provided in later 1.3.x kernels. You may need to turn on + additional kernel configuration options to get them to work; + see Documentation/ide/ide.txt. + + Even if support is not available for your interface, you may be + able to get it to work with the following procedure. First boot + MS-DOS and load the appropriate drivers. Then warm-boot linux + (i.e., without powering off). If this works, it can be automated + by running loadlin from the MS-DOS autoexec. + + +b. Timeout/IRQ errors. + + - If you always get timeout errors, interrupts from the drive are + probably not making it to the host. + + - IRQ problems may also be indicated by the message + `IRQ probe failed ()` while booting. If is zero, that + means that the system did not see an interrupt from the drive when + it was expecting one (on any feasible IRQ). If is negative, + that means the system saw interrupts on multiple IRQ lines, when + it was expecting to receive just one from the CDROM drive. + + - Double-check your hardware configuration to make sure that the IRQ + number of your IDE interface matches what the driver expects. + (The usual assignments are 14 for the primary (0x1f0) interface + and 15 for the secondary (0x170) interface.) Also be sure that + you don't have some other hardware which might be conflicting with + the IRQ you're using. Also check the BIOS setup for your system; + some have the ability to disable individual IRQ levels, and I've + had one report of a system which was shipped with IRQ 15 disabled + by default. + + - Note that many MS-DOS CDROM drivers will still function even if + there are hardware problems with the interrupt setup; they + apparently don't use interrupts. + + - If you own a Pioneer DR-A24X, you _will_ get nasty error messages + on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }" + The Pioneer DR-A24X CDROM drives are fairly popular these days. + Unfortunately, these drives seem to become very confused when we perform + the standard Linux ATA disk drive probe. If you own one of these drives, + you can bypass the ATA probing which confuses these CDROM drives, by + adding `append="hdX=noprobe hdX=cdrom"` to your lilo.conf file and running + lilo (again where X is the drive letter corresponding to where your drive + is installed.) + +c. System hangups. + + - If the system locks up when you try to access the CDROM, the most + likely cause is that you have a buggy IDE adapter which doesn't + properly handle simultaneous transactions on multiple interfaces. + The most notorious of these is the CMD640B chip. This problem can + be worked around by specifying the `serialize` option when + booting. Recent kernels should be able to detect the need for + this automatically in most cases, but the detection is not + foolproof. See Documentation/ide/ide.txt for more information + about the `serialize` option and the CMD640B. + + - Note that many MS-DOS CDROM drivers will work with such buggy + hardware, apparently because they never attempt to overlap CDROM + operations with other disk activity. + + +d. Can't mount a CDROM. + + - If you get errors from mount, it may help to check `dmesg` to see + if there are any more specific errors from the driver or from the + filesystem. + + - Make sure there's a CDROM loaded in the drive, and that's it's an + ISO 9660 disc. You can't mount an audio CD. + + - With the CDROM in the drive and unmounted, try something like:: + + cat /dev/cdrom | od | more + + If you see a dump, then the drive and driver are probably working + OK, and the problem is at the filesystem level (i.e., the CDROM is + not ISO 9660 or has errors in the filesystem structure). + + - If you see `not a block device` errors, check that the definitions + of the device special files are correct. They should be as + follows:: + + brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda + brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb + brw-rw---- 1 root disk 22, 0 Nov 11 18:48 /dev/hdc + brw-rw---- 1 root disk 22, 64 Nov 11 18:48 /dev/hdd + + Some early Slackware releases had these defined incorrectly. If + these are wrong, you can remake them by running the script + scripts/MAKEDEV.ide. (You may have to make it executable + with chmod first.) + + If you have a /dev/cdrom symbolic link, check that it is pointing + to the correct device file. + + If you hear people talking of the devices `hd1a` and `hd1b`, these + were old names for what are now called hdc and hdd. Those names + should be considered obsolete. + + - If mount is complaining that the iso9660 filesystem is not + available, but you know it is (check /proc/filesystems), you + probably need a newer version of mount. Early versions would not + always give meaningful error messages. + + +e. Directory listings are unpredictably truncated, and `dmesg` shows + `buffer botch` error messages from the driver. + + - There was a bug in the version of the driver in 1.2.x kernels + which could cause this. It was fixed in 1.3.0. If you can't + upgrade, you can probably work around the problem by specifying a + blocksize of 2048 when mounting. (Note that you won't be able to + directly execute binaries off the CDROM in that case.) + + If you see this in kernels later than 1.3.0, please report it as a + bug. + + +f. Data corruption. + + - Random data corruption was occasionally observed with the Hitachi + CDR-7730 CDROM. If you experience data corruption, using "hdx=slow" + as a command line parameter may work around the problem, at the + expense of low system performance. + + +5. cdchange.c +------------- + +:: + + /* + * cdchange.c [-v] [] + * + * This loads a CDROM from a specified slot in a changer, and displays + * information about the changer status. The drive should be unmounted before + * using this program. + * + * Changer information is displayed if either the -v flag is specified + * or no slot was specified. + * + * Based on code originally from Gerhard Zuber . + * Changer status information, and rewrite for the new Uniform CDROM driver + * interface by Erik Andersen . + */ + + #include + #include + #include + #include + #include + #include + #include + #include + + + int + main (int argc, char **argv) + { + char *program; + char *device; + int fd; /* file descriptor for CD-ROM device */ + int status; /* return status for system calls */ + int verbose = 0; + int slot=-1, x_slot; + int total_slots_available; + + program = argv[0]; + + ++argv; + --argc; + + if (argc < 1 || argc > 3) { + fprintf (stderr, "usage: %s [-v] []\n", + program); + fprintf (stderr, " Slots are numbered 1 -- n.\n"); + exit (1); + } + + if (strcmp (argv[0], "-v") == 0) { + verbose = 1; + ++argv; + --argc; + } + + device = argv[0]; + + if (argc == 2) + slot = atoi (argv[1]) - 1; + + /* open device */ + fd = open(device, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + fprintf (stderr, "%s: open failed for `%s`: %s\n", + program, device, strerror (errno)); + exit (1); + } + + /* Check CD player status */ + total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS); + if (total_slots_available <= 1 ) { + fprintf (stderr, "%s: Device `%s` is not an ATAPI " + "compliant CD changer.\n", program, device); + exit (1); + } + + if (slot >= 0) { + if (slot >= total_slots_available) { + fprintf (stderr, "Bad slot number. " + "Should be 1 -- %d.\n", + total_slots_available); + exit (1); + } + + /* load */ + slot=ioctl (fd, CDROM_SELECT_DISC, slot); + if (slot<0) { + fflush(stdout); + perror ("CDROM_SELECT_DISC "); + exit(1); + } + } + + if (slot < 0 || verbose) { + + status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT); + if (status<0) { + fflush(stdout); + perror (" CDROM_SELECT_DISC"); + exit(1); + } + slot=status; + + printf ("Current slot: %d\n", slot+1); + printf ("Total slots available: %d\n", + total_slots_available); + + printf ("Drive status: "); + status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (status<0) { + perror(" CDROM_DRIVE_STATUS"); + } else switch(status) { + case CDS_DISC_OK: + printf ("Ready.\n"); + break; + case CDS_TRAY_OPEN: + printf ("Tray Open.\n"); + break; + case CDS_DRIVE_NOT_READY: + printf ("Drive Not Ready.\n"); + break; + default: + printf ("This Should not happen!\n"); + break; + } + + for (x_slot=0; x_slot= +2KB on such a disc. For example, it should be possible to do:: + + # dvd+rw-format /dev/hdc (only needed if the disc has never + been formatted) + # mkudffs /dev/hdc + # mount /dev/hdc /cdrom -t udf -o rw,noatime + +However, some drives don't follow the specification and expect the +host to perform aligned writes at 32KB boundaries. Other drives do +follow the specification, but suffer bad performance problems if the +writes are not 32KB aligned. + +Both problems can be solved by using the pktcdvd driver, which always +generates aligned writes:: + + # dvd+rw-format /dev/hdc + # pktsetup dev_name /dev/hdc + # mkudffs /dev/pktcdvd/dev_name + # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime + + +Packet writing for DVD-RAM media +-------------------------------- + +DVD-RAM discs are random writable, so using the pktcdvd driver is not +necessary. However, using the pktcdvd driver can improve performance +in the same way it does for DVD+RW media. + + +Notes +----- + +- CD-RW media can usually not be overwritten more than about 1000 + times, so to avoid unnecessary wear on the media, you should always + use the noatime mount option. + +- Defect management (ie automatic remapping of bad sectors) has not + been implemented yet, so you are likely to get at least some + filesystem corruption if the disc wears out. + +- Since the pktcdvd driver makes the disc appear as a regular block + device with a 2KB block size, you can put any filesystem you like on + the disc. For example, run:: + + # /sbin/mke2fs /dev/pktcdvd/dev_name + + to create an ext2 filesystem on the disc. + + +Using the pktcdvd sysfs interface +--------------------------------- + +Since Linux 2.6.20, the pktcdvd module has a sysfs interface +and can be controlled by it. For example the "pktcdvd" tool uses +this interface. (see http://tom.ist-im-web.de/download/pktcdvd ) + +"pktcdvd" works similar to "pktsetup", e.g.:: + + # pktcdvd -a dev_name /dev/hdc + # mkudffs /dev/pktcdvd/dev_name + # mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram + # cp files /dvdram + # umount /dvdram + # pktcdvd -r dev_name + + +For a description of the sysfs interface look into the file: + + Documentation/ABI/testing/sysfs-class-pktcdvd + + +Using the pktcdvd debugfs interface +----------------------------------- + +To read pktcdvd device infos in human readable form, do:: + + # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info + +For a description of the debugfs interface look into the file: + + Documentation/ABI/testing/debugfs-pktcdvd + + + +Links +----- + +See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information +about DVD writing. diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt deleted file mode 100644 index 2834170d821e..000000000000 --- a/Documentation/cdrom/packet-writing.txt +++ /dev/null @@ -1,132 +0,0 @@ -Getting started quick ---------------------- - -- Select packet support in the block device section and UDF support in - the file system section. - -- Compile and install kernel and modules, reboot. - -- You need the udftools package (pktsetup, mkudffs, cdrwtool). - Download from http://sourceforge.net/projects/linux-udf/ - -- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute - as appropriate): - # cdrwtool -d /dev/hdc -q - -- Setup your writer - # pktsetup dev_name /dev/hdc - -- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy! - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD-RW media -------------------------------- - -DVD-RW discs can be written to much like CD-RW discs if they are in -the so called "restricted overwrite" mode. To put a disc in restricted -overwrite mode, run: - - # dvd+rw-format /dev/hdc - -You can then use the disc the same way you would use a CD-RW disc: - - # pktsetup dev_name /dev/hdc - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD+RW media -------------------------------- - -According to the DVD+RW specification, a drive supporting DVD+RW discs -shall implement "true random writes with 2KB granularity", which means -that it should be possible to put any filesystem with a block size >= -2KB on such a disc. For example, it should be possible to do: - - # dvd+rw-format /dev/hdc (only needed if the disc has never - been formatted) - # mkudffs /dev/hdc - # mount /dev/hdc /cdrom -t udf -o rw,noatime - -However, some drives don't follow the specification and expect the -host to perform aligned writes at 32KB boundaries. Other drives do -follow the specification, but suffer bad performance problems if the -writes are not 32KB aligned. - -Both problems can be solved by using the pktcdvd driver, which always -generates aligned writes. - - # dvd+rw-format /dev/hdc - # pktsetup dev_name /dev/hdc - # mkudffs /dev/pktcdvd/dev_name - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD-RAM media --------------------------------- - -DVD-RAM discs are random writable, so using the pktcdvd driver is not -necessary. However, using the pktcdvd driver can improve performance -in the same way it does for DVD+RW media. - - -Notes ------ - -- CD-RW media can usually not be overwritten more than about 1000 - times, so to avoid unnecessary wear on the media, you should always - use the noatime mount option. - -- Defect management (ie automatic remapping of bad sectors) has not - been implemented yet, so you are likely to get at least some - filesystem corruption if the disc wears out. - -- Since the pktcdvd driver makes the disc appear as a regular block - device with a 2KB block size, you can put any filesystem you like on - the disc. For example, run: - - # /sbin/mke2fs /dev/pktcdvd/dev_name - - to create an ext2 filesystem on the disc. - - -Using the pktcdvd sysfs interface ---------------------------------- - -Since Linux 2.6.20, the pktcdvd module has a sysfs interface -and can be controlled by it. For example the "pktcdvd" tool uses -this interface. (see http://tom.ist-im-web.de/download/pktcdvd ) - -"pktcdvd" works similar to "pktsetup", e.g.: - - # pktcdvd -a dev_name /dev/hdc - # mkudffs /dev/pktcdvd/dev_name - # mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram - # cp files /dvdram - # umount /dvdram - # pktcdvd -r dev_name - - -For a description of the sysfs interface look into the file: - - Documentation/ABI/testing/sysfs-class-pktcdvd - - -Using the pktcdvd debugfs interface ------------------------------------ - -To read pktcdvd device infos in human readable form, do: - - # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info - -For a description of the debugfs interface look into the file: - - Documentation/ABI/testing/debugfs-pktcdvd - - - -Links ------ - -See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information -about DVD writing. -- cgit From f0ba43774cea3fc14732bb9243ce7238ae8a3202 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:43 -0300 Subject: docs: convert docs to ReST and rename to *.rst The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Bjorn Helgaas Acked-by: Mark Brown Signed-off-by: Jonathan Corbet --- Documentation/device-mapper/cache-policies.rst | 131 +++++++ Documentation/device-mapper/cache-policies.txt | 121 ------ Documentation/device-mapper/cache.rst | 337 +++++++++++++++++ Documentation/device-mapper/cache.txt | 311 ---------------- Documentation/device-mapper/delay.rst | 31 ++ Documentation/device-mapper/delay.txt | 28 -- Documentation/device-mapper/dm-crypt.rst | 173 +++++++++ Documentation/device-mapper/dm-crypt.txt | 162 -------- Documentation/device-mapper/dm-flakey.rst | 74 ++++ Documentation/device-mapper/dm-flakey.txt | 57 --- Documentation/device-mapper/dm-init.rst | 125 +++++++ Documentation/device-mapper/dm-init.txt | 114 ------ Documentation/device-mapper/dm-integrity.rst | 259 +++++++++++++ Documentation/device-mapper/dm-integrity.txt | 233 ------------ Documentation/device-mapper/dm-io.rst | 75 ++++ Documentation/device-mapper/dm-io.txt | 75 ---- Documentation/device-mapper/dm-log.rst | 57 +++ Documentation/device-mapper/dm-log.txt | 54 --- Documentation/device-mapper/dm-queue-length.rst | 48 +++ Documentation/device-mapper/dm-queue-length.txt | 39 -- Documentation/device-mapper/dm-raid.rst | 419 +++++++++++++++++++++ Documentation/device-mapper/dm-raid.txt | 354 ------------------ Documentation/device-mapper/dm-service-time.rst | 101 +++++ Documentation/device-mapper/dm-service-time.txt | 91 ----- Documentation/device-mapper/dm-uevent.rst | 110 ++++++ Documentation/device-mapper/dm-uevent.txt | 97 ----- Documentation/device-mapper/dm-zoned.rst | 146 ++++++++ Documentation/device-mapper/dm-zoned.txt | 144 -------- Documentation/device-mapper/era.rst | 116 ++++++ Documentation/device-mapper/era.txt | 108 ------ Documentation/device-mapper/index.rst | 44 +++ Documentation/device-mapper/kcopyd.rst | 47 +++ Documentation/device-mapper/kcopyd.txt | 47 --- Documentation/device-mapper/linear.rst | 63 ++++ Documentation/device-mapper/linear.txt | 61 ---- Documentation/device-mapper/log-writes.rst | 145 ++++++++ Documentation/device-mapper/log-writes.txt | 140 ------- Documentation/device-mapper/persistent-data.rst | 88 +++++ Documentation/device-mapper/persistent-data.txt | 84 ----- Documentation/device-mapper/snapshot.rst | 180 +++++++++ Documentation/device-mapper/snapshot.txt | 176 --------- Documentation/device-mapper/statistics.rst | 225 ++++++++++++ Documentation/device-mapper/statistics.txt | 223 ----------- Documentation/device-mapper/striped.rst | 61 ++++ Documentation/device-mapper/striped.txt | 57 --- Documentation/device-mapper/switch.rst | 141 +++++++ Documentation/device-mapper/switch.txt | 138 ------- Documentation/device-mapper/thin-provisioning.rst | 427 ++++++++++++++++++++++ Documentation/device-mapper/thin-provisioning.txt | 411 --------------------- Documentation/device-mapper/unstriped.rst | 135 +++++++ Documentation/device-mapper/unstriped.txt | 124 ------- Documentation/device-mapper/verity.rst | 229 ++++++++++++ Documentation/device-mapper/verity.txt | 219 ----------- Documentation/device-mapper/writecache.rst | 79 ++++ Documentation/device-mapper/writecache.txt | 70 ---- Documentation/device-mapper/zero.rst | 37 ++ Documentation/device-mapper/zero.txt | 37 -- Documentation/filesystems/ubifs-authentication.md | 4 +- 58 files changed, 4105 insertions(+), 3777 deletions(-) create mode 100644 Documentation/device-mapper/cache-policies.rst delete mode 100644 Documentation/device-mapper/cache-policies.txt create mode 100644 Documentation/device-mapper/cache.rst delete mode 100644 Documentation/device-mapper/cache.txt create mode 100644 Documentation/device-mapper/delay.rst delete mode 100644 Documentation/device-mapper/delay.txt create mode 100644 Documentation/device-mapper/dm-crypt.rst delete mode 100644 Documentation/device-mapper/dm-crypt.txt create mode 100644 Documentation/device-mapper/dm-flakey.rst delete mode 100644 Documentation/device-mapper/dm-flakey.txt create mode 100644 Documentation/device-mapper/dm-init.rst delete mode 100644 Documentation/device-mapper/dm-init.txt create mode 100644 Documentation/device-mapper/dm-integrity.rst delete mode 100644 Documentation/device-mapper/dm-integrity.txt create mode 100644 Documentation/device-mapper/dm-io.rst delete mode 100644 Documentation/device-mapper/dm-io.txt create mode 100644 Documentation/device-mapper/dm-log.rst delete mode 100644 Documentation/device-mapper/dm-log.txt create mode 100644 Documentation/device-mapper/dm-queue-length.rst delete mode 100644 Documentation/device-mapper/dm-queue-length.txt create mode 100644 Documentation/device-mapper/dm-raid.rst delete mode 100644 Documentation/device-mapper/dm-raid.txt create mode 100644 Documentation/device-mapper/dm-service-time.rst delete mode 100644 Documentation/device-mapper/dm-service-time.txt create mode 100644 Documentation/device-mapper/dm-uevent.rst delete mode 100644 Documentation/device-mapper/dm-uevent.txt create mode 100644 Documentation/device-mapper/dm-zoned.rst delete mode 100644 Documentation/device-mapper/dm-zoned.txt create mode 100644 Documentation/device-mapper/era.rst delete mode 100644 Documentation/device-mapper/era.txt create mode 100644 Documentation/device-mapper/index.rst create mode 100644 Documentation/device-mapper/kcopyd.rst delete mode 100644 Documentation/device-mapper/kcopyd.txt create mode 100644 Documentation/device-mapper/linear.rst delete mode 100644 Documentation/device-mapper/linear.txt create mode 100644 Documentation/device-mapper/log-writes.rst delete mode 100644 Documentation/device-mapper/log-writes.txt create mode 100644 Documentation/device-mapper/persistent-data.rst delete mode 100644 Documentation/device-mapper/persistent-data.txt create mode 100644 Documentation/device-mapper/snapshot.rst delete mode 100644 Documentation/device-mapper/snapshot.txt create mode 100644 Documentation/device-mapper/statistics.rst delete mode 100644 Documentation/device-mapper/statistics.txt create mode 100644 Documentation/device-mapper/striped.rst delete mode 100644 Documentation/device-mapper/striped.txt create mode 100644 Documentation/device-mapper/switch.rst delete mode 100644 Documentation/device-mapper/switch.txt create mode 100644 Documentation/device-mapper/thin-provisioning.rst delete mode 100644 Documentation/device-mapper/thin-provisioning.txt create mode 100644 Documentation/device-mapper/unstriped.rst delete mode 100644 Documentation/device-mapper/unstriped.txt create mode 100644 Documentation/device-mapper/verity.rst delete mode 100644 Documentation/device-mapper/verity.txt create mode 100644 Documentation/device-mapper/writecache.rst delete mode 100644 Documentation/device-mapper/writecache.txt create mode 100644 Documentation/device-mapper/zero.rst delete mode 100644 Documentation/device-mapper/zero.txt (limited to 'Documentation') diff --git a/Documentation/device-mapper/cache-policies.rst b/Documentation/device-mapper/cache-policies.rst new file mode 100644 index 000000000000..b17fe352fc41 --- /dev/null +++ b/Documentation/device-mapper/cache-policies.rst @@ -0,0 +1,131 @@ +============================= +Guidance for writing policies +============================= + +Try to keep transactionality out of it. The core is careful to +avoid asking about anything that is migrating. This is a pain, but +makes it easier to write the policies. + +Mappings are loaded into the policy at construction time. + +Every bio that is mapped by the target is referred to the policy. +The policy can return a simple HIT or MISS or issue a migration. + +Currently there's no way for the policy to issue background work, +e.g. to start writing back dirty blocks that are going to be evicted +soon. + +Because we map bios, rather than requests it's easy for the policy +to get fooled by many small bios. For this reason the core target +issues periodic ticks to the policy. It's suggested that the policy +doesn't update states (eg, hit counts) for a block more than once +for each tick. The core ticks by watching bios complete, and so +trying to see when the io scheduler has let the ios run. + + +Overview of supplied cache replacement policies +=============================================== + +multiqueue (mq) +--------------- + +This policy is now an alias for smq (see below). + +The following tunables are accepted, but have no effect:: + + 'sequential_threshold <#nr_sequential_ios>' + 'random_threshold <#nr_random_ios>' + 'read_promote_adjustment ' + 'write_promote_adjustment ' + 'discard_promote_adjustment ' + +Stochastic multiqueue (smq) +--------------------------- + +This policy is the default. + +The stochastic multi-queue (smq) policy addresses some of the problems +with the multiqueue (mq) policy. + +The smq policy (vs mq) offers the promise of less memory utilization, +improved performance and increased adaptability in the face of changing +workloads. smq also does not have any cumbersome tuning knobs. + +Users may switch from "mq" to "smq" simply by appropriately reloading a +DM table that is using the cache target. Doing so will cause all of the +mq policy's hints to be dropped. Also, performance of the cache may +degrade slightly until smq recalculates the origin device's hotspots +that should be cached. + +Memory usage +^^^^^^^^^^^^ + +The mq policy used a lot of memory; 88 bytes per cache block on a 64 +bit machine. + +smq uses 28bit indexes to implement its data structures rather than +pointers. It avoids storing an explicit hit count for each block. It +has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of +the entries (each hotspot block covers a larger area than a single +cache block). + +All this means smq uses ~25bytes per cache block. Still a lot of +memory, but a substantial improvement nontheless. + +Level balancing +^^^^^^^^^^^^^^^ + +mq placed entries in different levels of the multiqueue structures +based on their hit count (~ln(hit count)). This meant the bottom +levels generally had the most entries, and the top ones had very +few. Having unbalanced levels like this reduced the efficacy of the +multiqueue. + +smq does not maintain a hit count, instead it swaps hit entries with +the least recently used entry from the level above. The overall +ordering being a side effect of this stochastic process. With this +scheme we can decide how many entries occupy each multiqueue level, +resulting in better promotion/demotion decisions. + +Adaptability: +The mq policy maintained a hit count for each cache block. For a +different block to get promoted to the cache its hit count has to +exceed the lowest currently in the cache. This meant it could take a +long time for the cache to adapt between varying IO patterns. + +smq doesn't maintain hit counts, so a lot of this problem just goes +away. In addition it tracks performance of the hotspot queue, which +is used to decide which blocks to promote. If the hotspot queue is +performing badly then it starts moving entries more quickly between +levels. This lets it adapt to new IO patterns very quickly. + +Performance +^^^^^^^^^^^ + +Testing smq shows substantially better performance than mq. + +cleaner +------- + +The cleaner writes back all dirty blocks in a cache to decommission it. + +Examples +======== + +The syntax for a table is:: + + cache + <#feature_args> []* + <#policy_args> []* + +The syntax to send a message using the dmsetup command is:: + + dmsetup message 0 sequential_threshold 1024 + dmsetup message 0 random_threshold 8 + +Using dmsetup:: + + dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ + /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" + creates a 128GB large mapped device named 'blah' with the + sequential threshold set to 1024 and the random_threshold set to 8. diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt deleted file mode 100644 index 86786d87d9a8..000000000000 --- a/Documentation/device-mapper/cache-policies.txt +++ /dev/null @@ -1,121 +0,0 @@ -Guidance for writing policies -============================= - -Try to keep transactionality out of it. The core is careful to -avoid asking about anything that is migrating. This is a pain, but -makes it easier to write the policies. - -Mappings are loaded into the policy at construction time. - -Every bio that is mapped by the target is referred to the policy. -The policy can return a simple HIT or MISS or issue a migration. - -Currently there's no way for the policy to issue background work, -e.g. to start writing back dirty blocks that are going to be evicted -soon. - -Because we map bios, rather than requests it's easy for the policy -to get fooled by many small bios. For this reason the core target -issues periodic ticks to the policy. It's suggested that the policy -doesn't update states (eg, hit counts) for a block more than once -for each tick. The core ticks by watching bios complete, and so -trying to see when the io scheduler has let the ios run. - - -Overview of supplied cache replacement policies -=============================================== - -multiqueue (mq) ---------------- - -This policy is now an alias for smq (see below). - -The following tunables are accepted, but have no effect: - - 'sequential_threshold <#nr_sequential_ios>' - 'random_threshold <#nr_random_ios>' - 'read_promote_adjustment ' - 'write_promote_adjustment ' - 'discard_promote_adjustment ' - -Stochastic multiqueue (smq) ---------------------------- - -This policy is the default. - -The stochastic multi-queue (smq) policy addresses some of the problems -with the multiqueue (mq) policy. - -The smq policy (vs mq) offers the promise of less memory utilization, -improved performance and increased adaptability in the face of changing -workloads. smq also does not have any cumbersome tuning knobs. - -Users may switch from "mq" to "smq" simply by appropriately reloading a -DM table that is using the cache target. Doing so will cause all of the -mq policy's hints to be dropped. Also, performance of the cache may -degrade slightly until smq recalculates the origin device's hotspots -that should be cached. - -Memory usage: -The mq policy used a lot of memory; 88 bytes per cache block on a 64 -bit machine. - -smq uses 28bit indexes to implement its data structures rather than -pointers. It avoids storing an explicit hit count for each block. It -has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of -the entries (each hotspot block covers a larger area than a single -cache block). - -All this means smq uses ~25bytes per cache block. Still a lot of -memory, but a substantial improvement nontheless. - -Level balancing: -mq placed entries in different levels of the multiqueue structures -based on their hit count (~ln(hit count)). This meant the bottom -levels generally had the most entries, and the top ones had very -few. Having unbalanced levels like this reduced the efficacy of the -multiqueue. - -smq does not maintain a hit count, instead it swaps hit entries with -the least recently used entry from the level above. The overall -ordering being a side effect of this stochastic process. With this -scheme we can decide how many entries occupy each multiqueue level, -resulting in better promotion/demotion decisions. - -Adaptability: -The mq policy maintained a hit count for each cache block. For a -different block to get promoted to the cache its hit count has to -exceed the lowest currently in the cache. This meant it could take a -long time for the cache to adapt between varying IO patterns. - -smq doesn't maintain hit counts, so a lot of this problem just goes -away. In addition it tracks performance of the hotspot queue, which -is used to decide which blocks to promote. If the hotspot queue is -performing badly then it starts moving entries more quickly between -levels. This lets it adapt to new IO patterns very quickly. - -Performance: -Testing smq shows substantially better performance than mq. - -cleaner -------- - -The cleaner writes back all dirty blocks in a cache to decommission it. - -Examples -======== - -The syntax for a table is: - cache - <#feature_args> []* - <#policy_args> []* - -The syntax to send a message using the dmsetup command is: - dmsetup message 0 sequential_threshold 1024 - dmsetup message 0 random_threshold 8 - -Using dmsetup: - dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ - /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" - creates a 128GB large mapped device named 'blah' with the - sequential threshold set to 1024 and the random_threshold set to 8. diff --git a/Documentation/device-mapper/cache.rst b/Documentation/device-mapper/cache.rst new file mode 100644 index 000000000000..f15e5254d05b --- /dev/null +++ b/Documentation/device-mapper/cache.rst @@ -0,0 +1,337 @@ +===== +Cache +===== + +Introduction +============ + +dm-cache is a device mapper target written by Joe Thornber, Heinz +Mauelshagen, and Mike Snitzer. + +It aims to improve performance of a block device (eg, a spindle) by +dynamically migrating some of its data to a faster, smaller device +(eg, an SSD). + +This device-mapper solution allows us to insert this caching at +different levels of the dm stack, for instance above the data device for +a thin-provisioning pool. Caching solutions that are integrated more +closely with the virtual memory system should give better performance. + +The target reuses the metadata library used in the thin-provisioning +library. + +The decision as to what data to migrate and when is left to a plug-in +policy module. Several of these have been written as we experiment, +and we hope other people will contribute others for specific io +scenarios (eg. a vm image server). + +Glossary +======== + + Migration + Movement of the primary copy of a logical block from one + device to the other. + Promotion + Migration from slow device to fast device. + Demotion + Migration from fast device to slow device. + +The origin device always contains a copy of the logical block, which +may be out of date or kept in sync with the copy on the cache device +(depending on policy). + +Design +====== + +Sub-devices +----------- + +The target is constructed by passing three devices to it (along with +other parameters detailed later): + +1. An origin device - the big, slow one. + +2. A cache device - the small, fast one. + +3. A small metadata device - records which blocks are in the cache, + which are dirty, and extra hints for use by the policy object. + This information could be put on the cache device, but having it + separate allows the volume manager to configure it differently, + e.g. as a mirror for extra robustness. This metadata device may only + be used by a single cache device. + +Fixed block size +---------------- + +The origin is divided up into blocks of a fixed size. This block size +is configurable when you first create the cache. Typically we've been +using block sizes of 256KB - 1024KB. The block size must be between 64 +sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). + +Having a fixed block size simplifies the target a lot. But it is +something of a compromise. For instance, a small part of a block may be +getting hit a lot, yet the whole block will be promoted to the cache. +So large block sizes are bad because they waste cache space. And small +block sizes are bad because they increase the amount of metadata (both +in core and on disk). + +Cache operating modes +--------------------- + +The cache has three operating modes: writeback, writethrough and +passthrough. + +If writeback, the default, is selected then a write to a block that is +cached will go only to the cache and the block will be marked dirty in +the metadata. + +If writethrough is selected then a write to a cached block will not +complete until it has hit both the origin and cache devices. Clean +blocks should remain clean. + +If passthrough is selected, useful when the cache contents are not known +to be coherent with the origin device, then all reads are served from +the origin device (all reads miss the cache) and all writes are +forwarded to the origin device; additionally, write hits cause cache +block invalidates. To enable passthrough mode the cache must be clean. +Passthrough mode allows a cache device to be activated without having to +worry about coherency. Coherency that exists is maintained, although +the cache will gradually cool as writes take place. If the coherency of +the cache can later be verified, or established through use of the +"invalidate_cblocks" message, the cache device can be transitioned to +writethrough or writeback mode while still warm. Otherwise, the cache +contents can be discarded prior to transitioning to the desired +operating mode. + +A simple cleaner policy is provided, which will clean (write back) all +dirty blocks in a cache. Useful for decommissioning a cache or when +shrinking a cache. Shrinking the cache's fast device requires all cache +blocks, in the area of the cache being removed, to be clean. If the +area being removed from the cache still contains dirty blocks the resize +will fail. Care must be taken to never reduce the volume used for the +cache's fast device until the cache is clean. This is of particular +importance if writeback mode is used. Writethrough and passthrough +modes already maintain a clean cache. Future support to partially clean +the cache, above a specified threshold, will allow for keeping the cache +warm and in writeback mode during resize. + +Migration throttling +-------------------- + +Migrating data between the origin and cache device uses bandwidth. +The user can set a throttle to prevent more than a certain amount of +migration occurring at any one time. Currently we're not taking any +account of normal io traffic going to the devices. More work needs +doing here to avoid migrating during those peak io moments. + +For the time being, a message "migration_threshold <#sectors>" +can be used to set the maximum number of sectors being migrated, +the default being 2048 sectors (1MB). + +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a FLUSH or FUA bio is written. +If no such requests are made then commits will occur every second. This +means the cache behaves like a physical disk that has a volatile write +cache. If power is lost you may lose some recent writes. The metadata +should always be consistent in spite of any crash. + +The 'dirty' state for a cache block changes far too frequently for us +to keep updating it on the fly. So we treat it as a hint. In normal +operation it will be written when the dm device is suspended. If the +system crashes all cache blocks will be assumed dirty when restarted. + +Per-block policy hints +---------------------- + +Policy plug-ins can store a chunk of data per cache block. It's up to +the policy how big this chunk is, but it should be kept small. Like the +dirty flags this data is lost if there's a crash so a safe fallback +value should always be possible. + +Policy hints affect performance, not correctness. + +Policy messaging +---------------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. Refer to cache-policies.txt. + +Discard bitset resolution +------------------------- + +We can avoid copying data during migration if we know the block has +been discarded. A prime example of this is when mkfs discards the +whole block device. We store a bitset tracking the discard state of +blocks. However, we allow this bitset to have a different block size +from the cache blocks. This is because we need to track the discard +state for all of the origin device (compare with the dirty bitset +which is just for the smaller cache device). + +Target interface +================ + +Constructor +----------- + + :: + + cache + <#feature args> []* + <#policy args> [policy args]* + + ================ ======================================================= + metadata dev fast device holding the persistent metadata + cache dev fast device holding cached data blocks + origin dev slow device holding original data blocks + block size cache unit size in sectors + + #feature args number of feature arguments passed + feature args writethrough or passthrough (The default is writeback.) + + policy the replacement policy to use + #policy args an even number of arguments corresponding to + key/value pairs passed to the policy + policy args key/value pairs passed to the policy + E.g. 'sequential_threshold 1024' + See cache-policies.txt for details. + ================ ======================================================= + +Optional feature arguments are: + + + ==================== ======================================================== + writethrough write through caching that prohibits cache block + content from being different from origin block content. + Without this argument, the default behaviour is to write + back cache block contents later for performance reasons, + so they may differ from the corresponding origin blocks. + + passthrough a degraded mode useful for various cache coherency + situations (e.g., rolling back snapshots of + underlying storage). Reads and writes always go to + the origin. If a write goes to a cached origin + block, then the cache block is invalidated. + To enable passthrough mode the cache must be clean. + + metadata2 use version 2 of the metadata. This stores the dirty + bits in a separate btree, which improves speed of + shutting down the cache. + + no_discard_passdown disable passing down discards from the cache + to the origin's data device. + ==================== ======================================================== + +A policy called 'default' is always registered. This is an alias for +the policy we currently think is giving best all round performance. + +As the default policy could vary between kernels, if you are relying on +the characteristics of a specific policy, always request it by name. + +Status +------ + +:: + + <#used metadata blocks>/<#total metadata blocks> + <#used cache blocks>/<#total cache blocks> + <#read hits> <#read misses> <#write hits> <#write misses> + <#demotions> <#promotions> <#dirty> <#features> * + <#core args> * <#policy args> * + + + +========================= ===================================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +cache block size Configurable block size for the cache device + in sectors +#used cache blocks Number of blocks resident in the cache +#total cache blocks Total number of cache blocks +#read hits Number of times a READ bio has been mapped + to the cache +#read misses Number of times a READ bio has been mapped + to the origin +#write hits Number of times a WRITE bio has been mapped + to the cache +#write misses Number of times a WRITE bio has been + mapped to the origin +#demotions Number of times a block has been removed + from the cache +#promotions Number of times a block has been moved to + the cache +#dirty Number of blocks in the cache that differ + from the origin +#feature args Number of feature args to follow +feature args 'writethrough' (optional) +#core args Number of core arguments (must be even) +core args Key/value pairs for tuning the core + e.g. migration_threshold +policy name Name of the policy +#policy args Number of policy arguments to follow (must be even) +policy args Key/value pairs e.g. sequential_threshold +cache metadata mode ro if read-only, rw if read-write + + In serious cases where even a read-only mode is + deemed unsafe no further I/O will be permitted and + the status will just contain the string 'Fail'. + The userspace recovery tools should then be used. +needs_check 'needs_check' if set, '-' if not set + A metadata operation has failed, resulting in the + needs_check flag being set in the metadata's + superblock. The metadata device must be + deactivated and checked/repaired before the + cache can be made fully operational again. + '-' indicates needs_check is not set. +========================= ===================================================== + +Messages +-------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. (A sysfs interface would also be possible.) + +The message format is:: + + + +E.g.:: + + dmsetup message my_cache 0 sequential_threshold 1024 + + +Invalidation is removing an entry from the cache without writing it +back. Cache blocks can be invalidated via the invalidate_cblocks +message, which takes an arbitrary number of cblock ranges. Each cblock +range's end value is "one past the end", meaning 5-10 expresses a range +of values from 5 to 9. Each cblock must be expressed as a decimal +value, in the future a variant message that takes cblock ranges +expressed in hexadecimal may be needed to better support efficient +invalidation of larger caches. The cache must be in passthrough mode +when invalidate_cblocks is used:: + + invalidate_cblocks [|-]* + +E.g.:: + + dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 + +Examples +======== + +The test suite can be found here: + +https://github.com/jthornber/device-mapper-test-suite + +:: + + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ + mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt deleted file mode 100644 index 8ae1cf8e94da..000000000000 --- a/Documentation/device-mapper/cache.txt +++ /dev/null @@ -1,311 +0,0 @@ -Introduction -============ - -dm-cache is a device mapper target written by Joe Thornber, Heinz -Mauelshagen, and Mike Snitzer. - -It aims to improve performance of a block device (eg, a spindle) by -dynamically migrating some of its data to a faster, smaller device -(eg, an SSD). - -This device-mapper solution allows us to insert this caching at -different levels of the dm stack, for instance above the data device for -a thin-provisioning pool. Caching solutions that are integrated more -closely with the virtual memory system should give better performance. - -The target reuses the metadata library used in the thin-provisioning -library. - -The decision as to what data to migrate and when is left to a plug-in -policy module. Several of these have been written as we experiment, -and we hope other people will contribute others for specific io -scenarios (eg. a vm image server). - -Glossary -======== - - Migration - Movement of the primary copy of a logical block from one - device to the other. - Promotion - Migration from slow device to fast device. - Demotion - Migration from fast device to slow device. - -The origin device always contains a copy of the logical block, which -may be out of date or kept in sync with the copy on the cache device -(depending on policy). - -Design -====== - -Sub-devices ------------ - -The target is constructed by passing three devices to it (along with -other parameters detailed later): - -1. An origin device - the big, slow one. - -2. A cache device - the small, fast one. - -3. A small metadata device - records which blocks are in the cache, - which are dirty, and extra hints for use by the policy object. - This information could be put on the cache device, but having it - separate allows the volume manager to configure it differently, - e.g. as a mirror for extra robustness. This metadata device may only - be used by a single cache device. - -Fixed block size ----------------- - -The origin is divided up into blocks of a fixed size. This block size -is configurable when you first create the cache. Typically we've been -using block sizes of 256KB - 1024KB. The block size must be between 64 -sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). - -Having a fixed block size simplifies the target a lot. But it is -something of a compromise. For instance, a small part of a block may be -getting hit a lot, yet the whole block will be promoted to the cache. -So large block sizes are bad because they waste cache space. And small -block sizes are bad because they increase the amount of metadata (both -in core and on disk). - -Cache operating modes ---------------------- - -The cache has three operating modes: writeback, writethrough and -passthrough. - -If writeback, the default, is selected then a write to a block that is -cached will go only to the cache and the block will be marked dirty in -the metadata. - -If writethrough is selected then a write to a cached block will not -complete until it has hit both the origin and cache devices. Clean -blocks should remain clean. - -If passthrough is selected, useful when the cache contents are not known -to be coherent with the origin device, then all reads are served from -the origin device (all reads miss the cache) and all writes are -forwarded to the origin device; additionally, write hits cause cache -block invalidates. To enable passthrough mode the cache must be clean. -Passthrough mode allows a cache device to be activated without having to -worry about coherency. Coherency that exists is maintained, although -the cache will gradually cool as writes take place. If the coherency of -the cache can later be verified, or established through use of the -"invalidate_cblocks" message, the cache device can be transitioned to -writethrough or writeback mode while still warm. Otherwise, the cache -contents can be discarded prior to transitioning to the desired -operating mode. - -A simple cleaner policy is provided, which will clean (write back) all -dirty blocks in a cache. Useful for decommissioning a cache or when -shrinking a cache. Shrinking the cache's fast device requires all cache -blocks, in the area of the cache being removed, to be clean. If the -area being removed from the cache still contains dirty blocks the resize -will fail. Care must be taken to never reduce the volume used for the -cache's fast device until the cache is clean. This is of particular -importance if writeback mode is used. Writethrough and passthrough -modes already maintain a clean cache. Future support to partially clean -the cache, above a specified threshold, will allow for keeping the cache -warm and in writeback mode during resize. - -Migration throttling --------------------- - -Migrating data between the origin and cache device uses bandwidth. -The user can set a throttle to prevent more than a certain amount of -migration occurring at any one time. Currently we're not taking any -account of normal io traffic going to the devices. More work needs -doing here to avoid migrating during those peak io moments. - -For the time being, a message "migration_threshold <#sectors>" -can be used to set the maximum number of sectors being migrated, -the default being 2048 sectors (1MB). - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the cache behaves like a physical disk that has a volatile write -cache. If power is lost you may lose some recent writes. The metadata -should always be consistent in spite of any crash. - -The 'dirty' state for a cache block changes far too frequently for us -to keep updating it on the fly. So we treat it as a hint. In normal -operation it will be written when the dm device is suspended. If the -system crashes all cache blocks will be assumed dirty when restarted. - -Per-block policy hints ----------------------- - -Policy plug-ins can store a chunk of data per cache block. It's up to -the policy how big this chunk is, but it should be kept small. Like the -dirty flags this data is lost if there's a crash so a safe fallback -value should always be possible. - -Policy hints affect performance, not correctness. - -Policy messaging ----------------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. Refer to cache-policies.txt. - -Discard bitset resolution -------------------------- - -We can avoid copying data during migration if we know the block has -been discarded. A prime example of this is when mkfs discards the -whole block device. We store a bitset tracking the discard state of -blocks. However, we allow this bitset to have a different block size -from the cache blocks. This is because we need to track the discard -state for all of the origin device (compare with the dirty bitset -which is just for the smaller cache device). - -Target interface -================ - -Constructor ------------ - - cache - <#feature args> []* - <#policy args> [policy args]* - - metadata dev : fast device holding the persistent metadata - cache dev : fast device holding cached data blocks - origin dev : slow device holding original data blocks - block size : cache unit size in sectors - - #feature args : number of feature arguments passed - feature args : writethrough or passthrough (The default is writeback.) - - policy : the replacement policy to use - #policy args : an even number of arguments corresponding to - key/value pairs passed to the policy - policy args : key/value pairs passed to the policy - E.g. 'sequential_threshold 1024' - See cache-policies.txt for details. - -Optional feature arguments are: - writethrough : write through caching that prohibits cache block - content from being different from origin block content. - Without this argument, the default behaviour is to write - back cache block contents later for performance reasons, - so they may differ from the corresponding origin blocks. - - passthrough : a degraded mode useful for various cache coherency - situations (e.g., rolling back snapshots of - underlying storage). Reads and writes always go to - the origin. If a write goes to a cached origin - block, then the cache block is invalidated. - To enable passthrough mode the cache must be clean. - - metadata2 : use version 2 of the metadata. This stores the dirty bits - in a separate btree, which improves speed of shutting - down the cache. - - no_discard_passdown : disable passing down discards from the cache - to the origin's data device. - -A policy called 'default' is always registered. This is an alias for -the policy we currently think is giving best all round performance. - -As the default policy could vary between kernels, if you are relying on -the characteristics of a specific policy, always request it by name. - -Status ------- - - <#used metadata blocks>/<#total metadata blocks> - <#used cache blocks>/<#total cache blocks> -<#read hits> <#read misses> <#write hits> <#write misses> -<#demotions> <#promotions> <#dirty> <#features> * -<#core args> * <#policy args> * - - -metadata block size : Fixed block size for each metadata block in - sectors -#used metadata blocks : Number of metadata blocks used -#total metadata blocks : Total number of metadata blocks -cache block size : Configurable block size for the cache device - in sectors -#used cache blocks : Number of blocks resident in the cache -#total cache blocks : Total number of cache blocks -#read hits : Number of times a READ bio has been mapped - to the cache -#read misses : Number of times a READ bio has been mapped - to the origin -#write hits : Number of times a WRITE bio has been mapped - to the cache -#write misses : Number of times a WRITE bio has been - mapped to the origin -#demotions : Number of times a block has been removed - from the cache -#promotions : Number of times a block has been moved to - the cache -#dirty : Number of blocks in the cache that differ - from the origin -#feature args : Number of feature args to follow -feature args : 'writethrough' (optional) -#core args : Number of core arguments (must be even) -core args : Key/value pairs for tuning the core - e.g. migration_threshold -policy name : Name of the policy -#policy args : Number of policy arguments to follow (must be even) -policy args : Key/value pairs e.g. sequential_threshold -cache metadata mode : ro if read-only, rw if read-write - In serious cases where even a read-only mode is deemed unsafe - no further I/O will be permitted and the status will just - contain the string 'Fail'. The userspace recovery tools - should then be used. -needs_check : 'needs_check' if set, '-' if not set - A metadata operation has failed, resulting in the needs_check - flag being set in the metadata's superblock. The metadata - device must be deactivated and checked/repaired before the - cache can be made fully operational again. '-' indicates - needs_check is not set. - -Messages --------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. (A sysfs interface would also be possible.) - -The message format is: - - - -E.g. - dmsetup message my_cache 0 sequential_threshold 1024 - - -Invalidation is removing an entry from the cache without writing it -back. Cache blocks can be invalidated via the invalidate_cblocks -message, which takes an arbitrary number of cblock ranges. Each cblock -range's end value is "one past the end", meaning 5-10 expresses a range -of values from 5 to 9. Each cblock must be expressed as a decimal -value, in the future a variant message that takes cblock ranges -expressed in hexadecimal may be needed to better support efficient -invalidation of larger caches. The cache must be in passthrough mode -when invalidate_cblocks is used. - - invalidate_cblocks [|-]* - -E.g. - dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 - -Examples -======== - -The test suite can be found here: - -https://github.com/jthornber/device-mapper-test-suite - -dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' -dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ - mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/device-mapper/delay.rst b/Documentation/device-mapper/delay.rst new file mode 100644 index 000000000000..917ba8c33359 --- /dev/null +++ b/Documentation/device-mapper/delay.rst @@ -0,0 +1,31 @@ +======== +dm-delay +======== + +Device-Mapper's "delay" target delays reads and/or writes +and maps them to different devices. + +Parameters:: + + [ + [ ]] + +With separate write parameters, the first set is only used for reads. +Offsets are specified in sectors. +Delays are specified in milliseconds. + +Example scripts +=============== + +:: + + #!/bin/sh + # Create device delaying rw operation for 500ms + echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed + +:: + + #!/bin/sh + # Create device delaying only write operation for 500ms and + # splitting reads and writes to different devices $1 $2 + echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt deleted file mode 100644 index 6426c45273cb..000000000000 --- a/Documentation/device-mapper/delay.txt +++ /dev/null @@ -1,28 +0,0 @@ -dm-delay -======== - -Device-Mapper's "delay" target delays reads and/or writes -and maps them to different devices. - -Parameters: - [ - [ ]] - -With separate write parameters, the first set is only used for reads. -Offsets are specified in sectors. -Delays are specified in milliseconds. - -Example scripts -=============== -[[ -#!/bin/sh -# Create device delaying rw operation for 500ms -echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed -]] - -[[ -#!/bin/sh -# Create device delaying only write operation for 500ms and -# splitting reads and writes to different devices $1 $2 -echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed -]] diff --git a/Documentation/device-mapper/dm-crypt.rst b/Documentation/device-mapper/dm-crypt.rst new file mode 100644 index 000000000000..8f4a3f889d43 --- /dev/null +++ b/Documentation/device-mapper/dm-crypt.rst @@ -0,0 +1,173 @@ +======== +dm-crypt +======== + +Device-Mapper's "crypt" target provides transparent encryption of block devices +using the kernel crypto API. + +For a more detailed description of supported parameters see: +https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt + +Parameters:: + + \ + [<#opt_params> ] + + + Encryption cipher, encryption mode and Initial Vector (IV) generator. + + The cipher specifications format is:: + + cipher[:keycount]-chainmode-ivmode[:ivopts] + + Examples:: + + aes-cbc-essiv:sha256 + aes-xts-plain64 + serpent-xts-plain64 + + Cipher format also supports direct specification with kernel crypt API + format (selected by capi: prefix). The IV specification is the same + as for the first format type. + This format is mainly used for specification of authenticated modes. + + The crypto API cipher specifications format is:: + + capi:cipher_api_spec-ivmode[:ivopts] + + Examples:: + + capi:cbc(aes)-essiv:sha256 + capi:xts(aes)-plain64 + + Examples of authenticated modes:: + + capi:gcm(aes)-random + capi:authenc(hmac(sha256),xts(aes))-random + capi:rfc7539(chacha20,poly1305)-random + + The /proc/crypto contains a list of curently loaded crypto modes. + + + Key used for encryption. It is encoded either as a hexadecimal number + or it can be passed as prefixed with single colon + character (':') for keys residing in kernel keyring service. + You can only use key sizes that are valid for the selected cipher + in combination with the selected iv mode. + Note that for some iv modes the key string can contain additional + keys (for example IV seed) so the key contains more parts concatenated + into a single string. + + + The kernel keyring key is identified by string in following format: + ::. + + + The encryption key size in bytes. The kernel key payload size must match + the value passed in . + + + Either 'logon' or 'user' kernel key type. + + + The kernel keyring key description crypt target should look for + when loading key of . + + + Multi-key compatibility mode. You can define keys and + then sectors are encrypted according to their offsets (sector 0 uses key0; + sector 1 uses key1 etc.). must be a power of two. + + + The IV offset is a sector count that is added to the sector number + before creating the IV. + + + This is the device that is going to be used as backend and contains the + encrypted data. You can specify it as a path like /dev/xxx or a device + number :. + + + Starting sector within the device where the encrypted data begins. + +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 3 allow_discards same_cpu_crypt submit_from_crypt_cpus + +allow_discards + Block discard requests (a.k.a. TRIM) are passed through the crypt device. + The default is to ignore discard requests. + + WARNING: Assess the specific security risks carefully before enabling this + option. For example, allowing discards on encrypted devices may lead to + the leak of information about the ciphertext device (filesystem type, + used space etc.) if the discarded blocks can be located easily on the + device later. + +same_cpu_crypt + Perform encryption using the same cpu that IO was submitted on. + The default is to use an unbound workqueue so that encryption work + is automatically balanced between available CPUs. + +submit_from_crypt_cpus + Disable offloading writes to a separate thread after encryption. + There are some situations where offloading write bios from the + encryption threads to a single thread degrades performance + significantly. The default is to offload write bios to the same + thread because it benefits CFQ to have writes submitted using the + same context. + +integrity:: + The device requires additional metadata per-sector stored + in per-bio integrity structure. This metadata must by provided + by underlying dm-integrity target. + + The can be "none" if metadata is used only for persistent IV. + + For Authenticated Encryption with Additional Data (AEAD) + the is "aead". An AEAD mode additionally calculates and verifies + integrity for the encrypted device. The additional space is then + used for storing authentication tag (and persistent IV if needed). + +sector_size: + Use as the encryption unit instead of 512 bytes sectors. + This option can be in range 512 - 4096 bytes and must be power of two. + Virtual device will announce this size as a minimal IO and logical sector. + +iv_large_sectors + IV generators will use sector number counted in units + instead of default 512 bytes sectors. + + For example, if is 4096 bytes, plain64 IV for the second + sector will be 8 (without flag) and 1 if iv_large_sectors is present. + The must be multiple of (in 512 bytes units) + if this flag is specified. + +Example scripts +=============== +LUKS (Linux Unified Key Setup) is now the preferred way to set up disk +encryption with dm-crypt using the 'cryptsetup' utility, see +https://gitlab.com/cryptsetup/cryptsetup + +:: + + #!/bin/sh + # Create a crypt device using dmsetup + dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using dmsetup when encryption key is stored in keyring service + dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using cryptsetup and LUKS header with default cipher + cryptsetup luksFormat $1 + cryptsetup luksOpen $1 crypt1 diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt deleted file mode 100644 index 3b3e1de21c9c..000000000000 --- a/Documentation/device-mapper/dm-crypt.txt +++ /dev/null @@ -1,162 +0,0 @@ -dm-crypt -========= - -Device-Mapper's "crypt" target provides transparent encryption of block devices -using the kernel crypto API. - -For a more detailed description of supported parameters see: -https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt - -Parameters: \ - [<#opt_params> ] - - - Encryption cipher, encryption mode and Initial Vector (IV) generator. - - The cipher specifications format is: - cipher[:keycount]-chainmode-ivmode[:ivopts] - Examples: - aes-cbc-essiv:sha256 - aes-xts-plain64 - serpent-xts-plain64 - - Cipher format also supports direct specification with kernel crypt API - format (selected by capi: prefix). The IV specification is the same - as for the first format type. - This format is mainly used for specification of authenticated modes. - - The crypto API cipher specifications format is: - capi:cipher_api_spec-ivmode[:ivopts] - Examples: - capi:cbc(aes)-essiv:sha256 - capi:xts(aes)-plain64 - Examples of authenticated modes: - capi:gcm(aes)-random - capi:authenc(hmac(sha256),xts(aes))-random - capi:rfc7539(chacha20,poly1305)-random - - The /proc/crypto contains a list of curently loaded crypto modes. - - - Key used for encryption. It is encoded either as a hexadecimal number - or it can be passed as prefixed with single colon - character (':') for keys residing in kernel keyring service. - You can only use key sizes that are valid for the selected cipher - in combination with the selected iv mode. - Note that for some iv modes the key string can contain additional - keys (for example IV seed) so the key contains more parts concatenated - into a single string. - - - The kernel keyring key is identified by string in following format: - ::. - - - The encryption key size in bytes. The kernel key payload size must match - the value passed in . - - - Either 'logon' or 'user' kernel key type. - - - The kernel keyring key description crypt target should look for - when loading key of . - - - Multi-key compatibility mode. You can define keys and - then sectors are encrypted according to their offsets (sector 0 uses key0; - sector 1 uses key1 etc.). must be a power of two. - - - The IV offset is a sector count that is added to the sector number - before creating the IV. - - - This is the device that is going to be used as backend and contains the - encrypted data. You can specify it as a path like /dev/xxx or a device - number :. - - - Starting sector within the device where the encrypted data begins. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional paramaters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 3 allow_discards same_cpu_crypt submit_from_crypt_cpus - -allow_discards - Block discard requests (a.k.a. TRIM) are passed through the crypt device. - The default is to ignore discard requests. - - WARNING: Assess the specific security risks carefully before enabling this - option. For example, allowing discards on encrypted devices may lead to - the leak of information about the ciphertext device (filesystem type, - used space etc.) if the discarded blocks can be located easily on the - device later. - -same_cpu_crypt - Perform encryption using the same cpu that IO was submitted on. - The default is to use an unbound workqueue so that encryption work - is automatically balanced between available CPUs. - -submit_from_crypt_cpus - Disable offloading writes to a separate thread after encryption. - There are some situations where offloading write bios from the - encryption threads to a single thread degrades performance - significantly. The default is to offload write bios to the same - thread because it benefits CFQ to have writes submitted using the - same context. - -integrity:: - The device requires additional metadata per-sector stored - in per-bio integrity structure. This metadata must by provided - by underlying dm-integrity target. - - The can be "none" if metadata is used only for persistent IV. - - For Authenticated Encryption with Additional Data (AEAD) - the is "aead". An AEAD mode additionally calculates and verifies - integrity for the encrypted device. The additional space is then - used for storing authentication tag (and persistent IV if needed). - -sector_size: - Use as the encryption unit instead of 512 bytes sectors. - This option can be in range 512 - 4096 bytes and must be power of two. - Virtual device will announce this size as a minimal IO and logical sector. - -iv_large_sectors - IV generators will use sector number counted in units - instead of default 512 bytes sectors. - - For example, if is 4096 bytes, plain64 IV for the second - sector will be 8 (without flag) and 1 if iv_large_sectors is present. - The must be multiple of (in 512 bytes units) - if this flag is specified. - -Example scripts -=============== -LUKS (Linux Unified Key Setup) is now the preferred way to set up disk -encryption with dm-crypt using the 'cryptsetup' utility, see -https://gitlab.com/cryptsetup/cryptsetup - -[[ -#!/bin/sh -# Create a crypt device using dmsetup -dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" -]] - -[[ -#!/bin/sh -# Create a crypt device using dmsetup when encryption key is stored in keyring service -dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" -]] - -[[ -#!/bin/sh -# Create a crypt device using cryptsetup and LUKS header with default cipher -cryptsetup luksFormat $1 -cryptsetup luksOpen $1 crypt1 -]] diff --git a/Documentation/device-mapper/dm-flakey.rst b/Documentation/device-mapper/dm-flakey.rst new file mode 100644 index 000000000000..86138735879d --- /dev/null +++ b/Documentation/device-mapper/dm-flakey.rst @@ -0,0 +1,74 @@ +========= +dm-flakey +========= + +This target is the same as the linear target except that it exhibits +unreliable behaviour periodically. It's been found useful in simulating +failing devices for testing purposes. + +Starting from the time the table is loaded, the device is available for + seconds, then exhibits unreliable behaviour for seconds, and then this cycle repeats. + +Also, consider using this in combination with the dm-delay target too, +which can delay reads and writes and/or send them to different +underlying devices. + +Table parameters +---------------- + +:: + + \ + [ []] + +Mandatory parameters: + + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + : + Number of seconds device is available. + : + Number of seconds device returns errors. + +Optional feature parameters: + + If no feature parameters are present, during the periods of + unreliability, all I/O returns errors. + + drop_writes: + All write I/O is silently ignored. + Read I/O is handled correctly. + + error_writes: + All write I/O is failed with an error signalled. + Read I/O is handled correctly. + + corrupt_bio_byte : + During , replace of the data of + each matching bio with . + + : + The offset of the byte to replace. + Counting starts at 1, to replace the first byte. + : + Either 'r' to corrupt reads or 'w' to corrupt writes. + 'w' is incompatible with drop_writes. + : + The value (from 0-255) to write. + : + Perform the replacement only if bio->bi_opf has all the + selected flags set. + +Examples: + +Replaces the 32nd byte of READ bios with the value 1:: + + corrupt_bio_byte 32 r 1 0 + +Replaces the 224th byte of REQ_META (=32) bios with the value 0:: + + corrupt_bio_byte 224 w 0 32 diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt deleted file mode 100644 index 9f0e247d0877..000000000000 --- a/Documentation/device-mapper/dm-flakey.txt +++ /dev/null @@ -1,57 +0,0 @@ -dm-flakey -========= - -This target is the same as the linear target except that it exhibits -unreliable behaviour periodically. It's been found useful in simulating -failing devices for testing purposes. - -Starting from the time the table is loaded, the device is available for - seconds, then exhibits unreliable behaviour for seconds, and then this cycle repeats. - -Also, consider using this in combination with the dm-delay target too, -which can delay reads and writes and/or send them to different -underlying devices. - -Table parameters ----------------- - \ - [ []] - -Mandatory parameters: - : Full pathname to the underlying block-device, or a - "major:minor" device-number. - : Starting sector within the device. - : Number of seconds device is available. - : Number of seconds device returns errors. - -Optional feature parameters: - If no feature parameters are present, during the periods of - unreliability, all I/O returns errors. - - drop_writes: - All write I/O is silently ignored. - Read I/O is handled correctly. - - error_writes: - All write I/O is failed with an error signalled. - Read I/O is handled correctly. - - corrupt_bio_byte : - During , replace of the data of - each matching bio with . - - : The offset of the byte to replace. - Counting starts at 1, to replace the first byte. - : Either 'r' to corrupt reads or 'w' to corrupt writes. - 'w' is incompatible with drop_writes. - : The value (from 0-255) to write. - : Perform the replacement only if bio->bi_opf has all the - selected flags set. - -Examples: - corrupt_bio_byte 32 r 1 0 - - replaces the 32nd byte of READ bios with the value 1 - - corrupt_bio_byte 224 w 0 32 - - replaces the 224th byte of REQ_META (=32) bios with the value 0 diff --git a/Documentation/device-mapper/dm-init.rst b/Documentation/device-mapper/dm-init.rst new file mode 100644 index 000000000000..e5242ff17e9b --- /dev/null +++ b/Documentation/device-mapper/dm-init.rst @@ -0,0 +1,125 @@ +================================ +Early creation of mapped devices +================================ + +It is possible to configure a device-mapper device to act as the root device for +your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal userspace +which configures the device, then pivot_root(8) in to it. + +The second is to create one or more device-mappers using the module parameter +"dm-mod.create=" through the kernel boot command line argument. + +The format is specified as a string of data separated by commas and optionally +semi-colons, where: + + - a comma is used to separate fields like name, uuid, flags and table + (specifies one device) + - a semi-colon is used to separate devices. + +So the format will look like this:: + + dm-mod.create=,,,,[,
+][;,,,,
[,
+]+] + +Where:: + + ::= The device name. + ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" + ::= The device minor number | "" + ::= "ro" | "rw" +
::= + ::= "verity" | "linear" | ... (see list below) + +The dm line should be equivalent to the one used by the dmsetup tool with the +`--concise` argument. + +Target types +============ + +Not all target types are available as there are serious risks in allowing +activation of certain DM targets without first using userspace tools to check +the validity of associated metadata. + +======================= ======================================================= +`cache` constrained, userspace should verify cache device +`crypt` allowed +`delay` allowed +`era` constrained, userspace should verify metadata device +`flakey` constrained, meant for test +`linear` allowed +`log-writes` constrained, userspace should verify metadata device +`mirror` constrained, userspace should verify main/mirror device +`raid` constrained, userspace should verify metadata device +`snapshot` constrained, userspace should verify src/dst device +`snapshot-origin` allowed +`snapshot-merge` constrained, userspace should verify src/dst device +`striped` allowed +`switch` constrained, userspace should verify dev path +`thin` constrained, requires dm target message from userspace +`thin-pool` constrained, requires dm target message from userspace +`verity` allowed +`writecache` constrained, userspace should verify cache device +`zero` constrained, not meant for rootfs +======================= ======================================================= + +If the target is not listed above, it is constrained by default (not tested). + +Examples +======== +An example of booting to a linear array made up of user-mode linux block +devices:: + + dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 + +This will boot to a rw dm-linear target of 8192 sectors split across two block +devices identified by their major:minor numbers. After boot, udev will rename +this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. + +An example of multiple device-mappers, with the dm-mod.create="..." contents +is shown here split on multiple lines for readability:: + + dm-linear,,1,rw, + 0 32768 linear 8:1 0, + 32768 1024000 linear 8:2 0; + dm-verity,,3,ro, + 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 + ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 + 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 + +Other examples (per target): + +"crypt":: + + dm-crypt,,8,ro, + 0 1048576 crypt aes-xts-plain64 + babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 + /dev/sda 0 1 allow_discards + +"delay":: + + dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 + +"linear":: + + dm-linear,,,rw, + 0 32768 linear /dev/sda1 0, + 32768 1024000 linear /dev/sda2 0, + 1056768 204800 linear /dev/sda3 0, + 1261568 512000 linear /dev/sda4 0 + +"snapshot-origin":: + + dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 + +"striped":: + + dm-striped,,4,ro,0 1638400 striped 4 4096 + /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 + +"verity":: + + dm-verity,,4,ro, + 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 + fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd + 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt deleted file mode 100644 index 130b3c3679c5..000000000000 --- a/Documentation/device-mapper/dm-init.txt +++ /dev/null @@ -1,114 +0,0 @@ -Early creation of mapped devices -==================================== - -It is possible to configure a device-mapper device to act as the root device for -your system in two ways. - -The first is to build an initial ramdisk which boots to a minimal userspace -which configures the device, then pivot_root(8) in to it. - -The second is to create one or more device-mappers using the module parameter -"dm-mod.create=" through the kernel boot command line argument. - -The format is specified as a string of data separated by commas and optionally -semi-colons, where: - - a comma is used to separate fields like name, uuid, flags and table - (specifies one device) - - a semi-colon is used to separate devices. - -So the format will look like this: - - dm-mod.create=,,,,
[,
+][;,,,,
[,
+]+] - -Where, - ::= The device name. - ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" - ::= The device minor number | "" - ::= "ro" | "rw" -
::= - ::= "verity" | "linear" | ... (see list below) - -The dm line should be equivalent to the one used by the dmsetup tool with the ---concise argument. - -Target types -============ - -Not all target types are available as there are serious risks in allowing -activation of certain DM targets without first using userspace tools to check -the validity of associated metadata. - - "cache": constrained, userspace should verify cache device - "crypt": allowed - "delay": allowed - "era": constrained, userspace should verify metadata device - "flakey": constrained, meant for test - "linear": allowed - "log-writes": constrained, userspace should verify metadata device - "mirror": constrained, userspace should verify main/mirror device - "raid": constrained, userspace should verify metadata device - "snapshot": constrained, userspace should verify src/dst device - "snapshot-origin": allowed - "snapshot-merge": constrained, userspace should verify src/dst device - "striped": allowed - "switch": constrained, userspace should verify dev path - "thin": constrained, requires dm target message from userspace - "thin-pool": constrained, requires dm target message from userspace - "verity": allowed - "writecache": constrained, userspace should verify cache device - "zero": constrained, not meant for rootfs - -If the target is not listed above, it is constrained by default (not tested). - -Examples -======== -An example of booting to a linear array made up of user-mode linux block -devices: - - dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 - -This will boot to a rw dm-linear target of 8192 sectors split across two block -devices identified by their major:minor numbers. After boot, udev will rename -this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. - -An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here -split on multiple lines for readability: - - dm-linear,,1,rw, - 0 32768 linear 8:1 0, - 32768 1024000 linear 8:2 0; - dm-verity,,3,ro, - 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 - ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 - 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 - -Other examples (per target): - -"crypt": - dm-crypt,,8,ro, - 0 1048576 crypt aes-xts-plain64 - babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 - /dev/sda 0 1 allow_discards - -"delay": - dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 - -"linear": - dm-linear,,,rw, - 0 32768 linear /dev/sda1 0, - 32768 1024000 linear /dev/sda2 0, - 1056768 204800 linear /dev/sda3 0, - 1261568 512000 linear /dev/sda4 0 - -"snapshot-origin": - dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 - -"striped": - dm-striped,,4,ro,0 1638400 striped 4 4096 - /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 - -"verity": - dm-verity,,4,ro, - 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 - fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd - 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/device-mapper/dm-integrity.rst b/Documentation/device-mapper/dm-integrity.rst new file mode 100644 index 000000000000..a30aa91b5fbe --- /dev/null +++ b/Documentation/device-mapper/dm-integrity.rst @@ -0,0 +1,259 @@ +============ +dm-integrity +============ + +The dm-integrity target emulates a block device that has additional +per-sector tags that can be used for storing integrity information. + +A general problem with storing integrity tags with every sector is that +writing the sector and the integrity tag must be atomic - i.e. in case of +crash, either both sector and integrity tag or none of them is written. + +To guarantee write atomicity, the dm-integrity target uses journal, it +writes sector data and integrity tags into a journal, commits the journal +and then copies the data and integrity tags to their respective location. + +The dm-integrity target can be used with the dm-crypt target - in this +situation the dm-crypt target creates the integrity data and passes them +to the dm-integrity target via bio_integrity_payload attached to the bio. +In this mode, the dm-crypt and dm-integrity targets provide authenticated +disk encryption - if the attacker modifies the encrypted device, an I/O +error is returned instead of random data. + +The dm-integrity target can also be used as a standalone target, in this +mode it calculates and verifies the integrity tag internally. In this +mode, the dm-integrity target can be used to detect silent data +corruption on the disk or in the I/O path. + +There's an alternate mode of operation where dm-integrity uses bitmap +instead of a journal. If a bit in the bitmap is 1, the corresponding +region's data and integrity tags are not synchronized - if the machine +crashes, the unsynchronized regions will be recalculated. The bitmap mode +is faster than the journal mode, because we don't have to write the data +twice, but it is also less reliable, because if data corruption happens +when the machine crashes, it may not be detected. + +When loading the target for the first time, the kernel driver will format +the device. But it will only format the device if the superblock contains +zeroes. If the superblock is neither valid nor zeroed, the dm-integrity +target can't be loaded. + +To use the target for the first time: + +1. overwrite the superblock with zeroes +2. load the dm-integrity target with one-sector size, the kernel driver + will format the device +3. unload the dm-integrity target +4. read the "provided_data_sectors" value from the superblock +5. load the dm-integrity target with the the target size + "provided_data_sectors" +6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target + with the size "provided_data_sectors" + + +Target arguments: + +1. the underlying block device + +2. the number of reserved sector at the beginning of the device - the + dm-integrity won't read of write these sectors + +3. the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + + D - direct writes (without journal) + in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag doesn't match. + J - journaled writes + data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode degrades write throughput twice because the + data have to be written twice. + B - bitmap mode - data and metadata are written without any + synchronization, the driver maintains a bitmap of dirty + regions where data and metadata don't match. This mode can + only be used with internal hash. + R - recovery mode - in this mode, journal is not replayed, + checksums are not checked and writes to the device are not + allowed. This mode is useful for data recovery if the + device cannot be activated in any of the other standard + modes. + +5. the number of additional arguments + +Additional arguments: + +journal_sectors:number + The size of journal, this argument is used only if formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave_sectors:number + The number of interleaved sectors. This values is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +meta_device:device + Don't interleave the data and metadata on on device. Use a + separate device for metadata. + +buffer_sectors:number + The number of sectors in one buffer. The value is rounded down to + a power of two. + + The tag area is accessed using buffers, the buffer size is + configurable. The large buffer size means that the I/O size will + be larger, but there could be less I/Os issued. + +journal_watermark:number + The journal watermark in percents. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit_time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediatelly if the FLUSH + request is received. + +internal_hash:algorithm(:key) (the key is optional) + Use internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32), then integrity target + will protect the data against accidental corruption. + You can also use a hmac algorithm (for example + "hmac(sha256):0123456789abcdef"), in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +recalculate + Recalculate the integrity tags automatically. It is only valid + when using internal hash. + +journal_crypt:algorithm(:key) (the key is optional) + Encrypt the journal using given algorithm to make sure that the + attacker can't read the journal. You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains history of last writes to the block device, + an attacker reading the journal could see the last sector nubmers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. To protect against this + situation, you can encrypt the journal. + +journal_mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification. To protect against accidental modification, use a + crc algorithm, to protect against malicious modification, use a + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, modified sector number would be detected at + this stage. + +block_size:number + The size of a data block in bytes. The larger the block size the + less overhead there is for per-block integrity metadata. + Supported values are 512, 1024, 2048 and 4096 bytes. If not + specified the default block size is 512 bytes. + +sectors_per_bit:number + In the bitmap mode, this parameter specifies the number of + 512-byte sectors that corresponds to one bitmap bit. + +bitmap_flush_interval:number + The bitmap flush interval in milliseconds. The metadata buffers + are synchronized when this interval expires. + + +The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of disk data depend on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: + +* reserved sectors + (they are not used by this target, they can be used for + storing LUKS metadata or for other purpose), the size of the reserved + area is specified in the target arguments + +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags + SB_FLAG_HAVE_JOURNAL_MAC + - a flag is set if journal_mac is used + SB_FLAG_RECALCULATING + - recalculating is in progress + SB_FLAG_DIRTY_BITMAP + - journal area contains the bitmap of dirty + blocks + * log2(sectors per block) + * a position where recalculating finished +* journal + The journal is divided into sections, each section contains: + + * metadata area (4kiB), it contains journal entries + + - every journal entry contains: + + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + + - every metadata sector ends with + + * mac (8-bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store hmac of sector + numbers in the journal section, to protect against a + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + + - every sector in the data area contains: + + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. + +* one or more runs of interleaved tags and data. + Each run contains: + + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. log2 of this value is stored + in the superblock. diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt deleted file mode 100644 index d63d78ffeb73..000000000000 --- a/Documentation/device-mapper/dm-integrity.txt +++ /dev/null @@ -1,233 +0,0 @@ -The dm-integrity target emulates a block device that has additional -per-sector tags that can be used for storing integrity information. - -A general problem with storing integrity tags with every sector is that -writing the sector and the integrity tag must be atomic - i.e. in case of -crash, either both sector and integrity tag or none of them is written. - -To guarantee write atomicity, the dm-integrity target uses journal, it -writes sector data and integrity tags into a journal, commits the journal -and then copies the data and integrity tags to their respective location. - -The dm-integrity target can be used with the dm-crypt target - in this -situation the dm-crypt target creates the integrity data and passes them -to the dm-integrity target via bio_integrity_payload attached to the bio. -In this mode, the dm-crypt and dm-integrity targets provide authenticated -disk encryption - if the attacker modifies the encrypted device, an I/O -error is returned instead of random data. - -The dm-integrity target can also be used as a standalone target, in this -mode it calculates and verifies the integrity tag internally. In this -mode, the dm-integrity target can be used to detect silent data -corruption on the disk or in the I/O path. - -There's an alternate mode of operation where dm-integrity uses bitmap -instead of a journal. If a bit in the bitmap is 1, the corresponding -region's data and integrity tags are not synchronized - if the machine -crashes, the unsynchronized regions will be recalculated. The bitmap mode -is faster than the journal mode, because we don't have to write the data -twice, but it is also less reliable, because if data corruption happens -when the machine crashes, it may not be detected. - -When loading the target for the first time, the kernel driver will format -the device. But it will only format the device if the superblock contains -zeroes. If the superblock is neither valid nor zeroed, the dm-integrity -target can't be loaded. - -To use the target for the first time: -1. overwrite the superblock with zeroes -2. load the dm-integrity target with one-sector size, the kernel driver - will format the device -3. unload the dm-integrity target -4. read the "provided_data_sectors" value from the superblock -5. load the dm-integrity target with the the target size - "provided_data_sectors" -6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target - with the size "provided_data_sectors" - - -Target arguments: - -1. the underlying block device - -2. the number of reserved sector at the beginning of the device - the - dm-integrity won't read of write these sectors - -3. the size of the integrity tag (if "-" is used, the size is taken from - the internal-hash algorithm) - -4. mode: - D - direct writes (without journal) - in this mode, journaling is - not used and data sectors and integrity tags are written - separately. In case of crash, it is possible that the data - and integrity tag doesn't match. - J - journaled writes - data and integrity tags are written to the - journal and atomicity is guaranteed. In case of crash, - either both data and tag or none of them are written. The - journaled mode degrades write throughput twice because the - data have to be written twice. - B - bitmap mode - data and metadata are written without any - synchronization, the driver maintains a bitmap of dirty - regions where data and metadata don't match. This mode can - only be used with internal hash. - R - recovery mode - in this mode, journal is not replayed, - checksums are not checked and writes to the device are not - allowed. This mode is useful for data recovery if the - device cannot be activated in any of the other standard - modes. - -5. the number of additional arguments - -Additional arguments: - -journal_sectors:number - The size of journal, this argument is used only if formatting the - device. If the device is already formatted, the value from the - superblock is used. - -interleave_sectors:number - The number of interleaved sectors. This values is rounded down to - a power of two. If the device is already formatted, the value from - the superblock is used. - -meta_device:device - Don't interleave the data and metadata on on device. Use a - separate device for metadata. - -buffer_sectors:number - The number of sectors in one buffer. The value is rounded down to - a power of two. - - The tag area is accessed using buffers, the buffer size is - configurable. The large buffer size means that the I/O size will - be larger, but there could be less I/Os issued. - -journal_watermark:number - The journal watermark in percents. When the size of the journal - exceeds this watermark, the thread that flushes the journal will - be started. - -commit_time:number - Commit time in milliseconds. When this time passes, the journal is - written. The journal is also written immediatelly if the FLUSH - request is received. - -internal_hash:algorithm(:key) (the key is optional) - Use internal hash or crc. - When this argument is used, the dm-integrity target won't accept - integrity tags from the upper target, but it will automatically - generate and verify the integrity tags. - - You can use a crc algorithm (such as crc32), then integrity target - will protect the data against accidental corruption. - You can also use a hmac algorithm (for example - "hmac(sha256):0123456789abcdef"), in this mode it will provide - cryptographic authentication of the data without encryption. - - When this argument is not used, the integrity tags are accepted - from an upper layer target, such as dm-crypt. The upper layer - target should check the validity of the integrity tags. - -recalculate - Recalculate the integrity tags automatically. It is only valid - when using internal hash. - -journal_crypt:algorithm(:key) (the key is optional) - Encrypt the journal using given algorithm to make sure that the - attacker can't read the journal. You can use a block cipher here - (such as "cbc(aes)") or a stream cipher (for example "chacha20", - "salsa20", "ctr(aes)" or "ecb(arc4)"). - - The journal contains history of last writes to the block device, - an attacker reading the journal could see the last sector nubmers - that were written. From the sector numbers, the attacker can infer - the size of files that were written. To protect against this - situation, you can encrypt the journal. - -journal_mac:algorithm(:key) (the key is optional) - Protect sector numbers in the journal from accidental or malicious - modification. To protect against accidental modification, use a - crc algorithm, to protect against malicious modification, use a - hmac algorithm with a key. - - This option is not needed when using internal-hash because in this - mode, the integrity of journal entries is checked when replaying - the journal. Thus, modified sector number would be detected at - this stage. - -block_size:number - The size of a data block in bytes. The larger the block size the - less overhead there is for per-block integrity metadata. - Supported values are 512, 1024, 2048 and 4096 bytes. If not - specified the default block size is 512 bytes. - -sectors_per_bit:number - In the bitmap mode, this parameter specifies the number of - 512-byte sectors that corresponds to one bitmap bit. - -bitmap_flush_interval:number - The bitmap flush interval in milliseconds. The metadata buffers - are synchronized when this interval expires. - - -The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can -be changed when reloading the target (load an inactive table and swap the -tables with suspend and resume). The other arguments should not be changed -when reloading the target because the layout of disk data depend on them -and the reloaded target would be non-functional. - - -The layout of the formatted block device: -* reserved sectors (they are not used by this target, they can be used for - storing LUKS metadata or for other purpose), the size of the reserved - area is specified in the target arguments -* superblock (4kiB) - * magic string - identifies that the device was formatted - * version - * log2(interleave sectors) - * integrity tag size - * the number of journal sections - * provided data sectors - the number of sectors that this target - provides (i.e. the size of the device minus the size of all - metadata and padding). The user of this target should not send - bios that access data beyond the "provided data sectors" limit. - * flags - SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used - SB_FLAG_RECALCULATING - recalculating is in progress - SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty - blocks - * log2(sectors per block) - * a position where recalculating finished -* journal - The journal is divided into sections, each section contains: - * metadata area (4kiB), it contains journal entries - every journal entry contains: - * logical sector (specifies where the data and tag should - be written) - * last 8 bytes of data - * integrity tag (the size is specified in the superblock) - every metadata sector ends with - * mac (8-bytes), all the macs in 8 metadata sectors form a - 64-byte value. It is used to store hmac of sector - numbers in the journal section, to protect against a - possibility that the attacker tampers with sector - numbers in the journal. - * commit id - * data area (the size is variable; it depends on how many journal - entries fit into the metadata area) - every sector in the data area contains: - * data (504 bytes of data, the last 8 bytes are stored in - the journal entry) - * commit id - To test if the whole journal section was written correctly, every - 512-byte sector of the journal ends with 8-byte commit id. If the - commit id matches on all sectors in a journal section, then it is - assumed that the section was written correctly. If the commit id - doesn't match, the section was written partially and it should not - be replayed. -* one or more runs of interleaved tags and data. Each run contains: - * tag area - it contains integrity tags. There is one tag for each - sector in the data area - * data area - it contains data sectors. The number of data sectors - in one run must be a power of two. log2 of this value is stored - in the superblock. diff --git a/Documentation/device-mapper/dm-io.rst b/Documentation/device-mapper/dm-io.rst new file mode 100644 index 000000000000..d2492917a1f5 --- /dev/null +++ b/Documentation/device-mapper/dm-io.rst @@ -0,0 +1,75 @@ +===== +dm-io +===== + +Dm-io provides synchronous and asynchronous I/O services. There are three +types of I/O services available, and each type has a sync and an async +version. + +The user must set up an io_region structure to describe the desired location +of the I/O. Each io_region indicates a block-device along with the starting +sector and size of the region:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +Dm-io can read from one io_region or write to one or more io_regions. Writes +to multiple regions are specified by an array of io_region structures. + +The first I/O service type takes a list of memory pages as the data buffer for +the I/O, along with an offset into the first page:: + + struct page_list { + struct page_list *next; + struct page *page; + }; + + int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + unsigned long *error_bits); + int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + io_notify_fn fn, void *context); + +The second I/O service type takes an array of bio vectors as the data buffer +for the I/O. This service can be handy if the caller has a pre-assembled bio, +but wants to direct different portions of the bio to different devices:: + + int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + unsigned long *error_bits); + int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + io_notify_fn fn, void *context); + +The third I/O service type takes a pointer to a vmalloc'd memory buffer as the +data buffer for the I/O. This service can be handy if the caller needs to do +I/O to a large region but doesn't want to allocate a large number of individual +memory pages:: + + int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, unsigned long *error_bits); + int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, io_notify_fn fn, void *context); + +Callers of the asynchronous I/O services must include the name of a completion +callback routine and a pointer to some context data for the I/O:: + + typedef void (*io_notify_fn)(unsigned long error, void *context); + +The "error" parameter in this callback, as well as the `*error` parameter in +all of the synchronous versions, is a bitset (instead of a simple error value). +In the case of an write-I/O to multiple regions, this bitset allows dm-io to +indicate success or failure on each individual region. + +Before using any of the dm-io services, the user should call dm_io_get() +and specify the number of pages they expect to perform I/O on concurrently. +Dm-io will attempt to resize its mempool to make sure enough pages are +always available in order to avoid unnecessary waiting while performing I/O. + +When the user is finished using the dm-io services, they should call +dm_io_put() and specify the same number of pages that were given on the +dm_io_get() call. diff --git a/Documentation/device-mapper/dm-io.txt b/Documentation/device-mapper/dm-io.txt deleted file mode 100644 index 3b5d9a52cdcf..000000000000 --- a/Documentation/device-mapper/dm-io.txt +++ /dev/null @@ -1,75 +0,0 @@ -dm-io -===== - -Dm-io provides synchronous and asynchronous I/O services. There are three -types of I/O services available, and each type has a sync and an async -version. - -The user must set up an io_region structure to describe the desired location -of the I/O. Each io_region indicates a block-device along with the starting -sector and size of the region. - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -Dm-io can read from one io_region or write to one or more io_regions. Writes -to multiple regions are specified by an array of io_region structures. - -The first I/O service type takes a list of memory pages as the data buffer for -the I/O, along with an offset into the first page. - - struct page_list { - struct page_list *next; - struct page *page; - }; - - int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - unsigned long *error_bits); - int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - io_notify_fn fn, void *context); - -The second I/O service type takes an array of bio vectors as the data buffer -for the I/O. This service can be handy if the caller has a pre-assembled bio, -but wants to direct different portions of the bio to different devices. - - int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - unsigned long *error_bits); - int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - io_notify_fn fn, void *context); - -The third I/O service type takes a pointer to a vmalloc'd memory buffer as the -data buffer for the I/O. This service can be handy if the caller needs to do -I/O to a large region but doesn't want to allocate a large number of individual -memory pages. - - int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, unsigned long *error_bits); - int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, io_notify_fn fn, void *context); - -Callers of the asynchronous I/O services must include the name of a completion -callback routine and a pointer to some context data for the I/O. - - typedef void (*io_notify_fn)(unsigned long error, void *context); - -The "error" parameter in this callback, as well as the "*error" parameter in -all of the synchronous versions, is a bitset (instead of a simple error value). -In the case of an write-I/O to multiple regions, this bitset allows dm-io to -indicate success or failure on each individual region. - -Before using any of the dm-io services, the user should call dm_io_get() -and specify the number of pages they expect to perform I/O on concurrently. -Dm-io will attempt to resize its mempool to make sure enough pages are -always available in order to avoid unnecessary waiting while performing I/O. - -When the user is finished using the dm-io services, they should call -dm_io_put() and specify the same number of pages that were given on the -dm_io_get() call. - diff --git a/Documentation/device-mapper/dm-log.rst b/Documentation/device-mapper/dm-log.rst new file mode 100644 index 000000000000..ba4fce39bc27 --- /dev/null +++ b/Documentation/device-mapper/dm-log.rst @@ -0,0 +1,57 @@ +===================== +Device-Mapper Logging +===================== +The device-mapper logging code is used by some of the device-mapper +RAID targets to track regions of the disk that are not consistent. +A region (or portion of the address space) of the disk may be +inconsistent because a RAID stripe is currently being operated on or +a machine died while the region was being altered. In the case of +mirrors, a region would be considered dirty/inconsistent while you +are writing to it because the writes need to be replicated for all +the legs of the mirror and may not reach the legs at the same time. +Once all writes are complete, the region is considered clean again. + +There is a generic logging interface that the device-mapper RAID +implementations use to perform logging operations (see +dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different +logging implementations are available and provide different +capabilities. The list includes: + +============== ============================================================== +Type Files +============== ============================================================== +disk drivers/md/dm-log.c +core drivers/md/dm-log.c +userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h +============== ============================================================== + +The "disk" log type +------------------- +This log implementation commits the log state to disk. This way, the +logging state survives reboots/crashes. + +The "core" log type +------------------- +This log implementation keeps the log state in memory. The log state +will not survive a reboot or crash, but there may be a small boost in +performance. This method can also be used if no storage device is +available for storing log state. + +The "userspace" log type +------------------------ +This log type simply provides a way to export the log API to userspace, +so log implementations can be done there. This is done by forwarding most +logging requests to userspace, where a daemon receives and processes the +request. + +The structure used for communication between kernel and userspace are +located in include/linux/dm-log-userspace.h. Due to the frequency, +diversity, and 2-way communication nature of the exchanges between +kernel and userspace, 'connector' is used as the interface for +communication. + +There are currently two userspace log implementations that leverage this +framework - "clustered-disk" and "clustered-core". These implementations +provide a cluster-coherent log for shared-storage. Device-mapper mirroring +can be used in a shared-storage environment when the cluster log implementations +are employed. diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt deleted file mode 100644 index c155ac569c44..000000000000 --- a/Documentation/device-mapper/dm-log.txt +++ /dev/null @@ -1,54 +0,0 @@ -Device-Mapper Logging -===================== -The device-mapper logging code is used by some of the device-mapper -RAID targets to track regions of the disk that are not consistent. -A region (or portion of the address space) of the disk may be -inconsistent because a RAID stripe is currently being operated on or -a machine died while the region was being altered. In the case of -mirrors, a region would be considered dirty/inconsistent while you -are writing to it because the writes need to be replicated for all -the legs of the mirror and may not reach the legs at the same time. -Once all writes are complete, the region is considered clean again. - -There is a generic logging interface that the device-mapper RAID -implementations use to perform logging operations (see -dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different -logging implementations are available and provide different -capabilities. The list includes: - -Type Files -==== ===== -disk drivers/md/dm-log.c -core drivers/md/dm-log.c -userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h - -The "disk" log type -------------------- -This log implementation commits the log state to disk. This way, the -logging state survives reboots/crashes. - -The "core" log type -------------------- -This log implementation keeps the log state in memory. The log state -will not survive a reboot or crash, but there may be a small boost in -performance. This method can also be used if no storage device is -available for storing log state. - -The "userspace" log type ------------------------- -This log type simply provides a way to export the log API to userspace, -so log implementations can be done there. This is done by forwarding most -logging requests to userspace, where a daemon receives and processes the -request. - -The structure used for communication between kernel and userspace are -located in include/linux/dm-log-userspace.h. Due to the frequency, -diversity, and 2-way communication nature of the exchanges between -kernel and userspace, 'connector' is used as the interface for -communication. - -There are currently two userspace log implementations that leverage this -framework - "clustered-disk" and "clustered-core". These implementations -provide a cluster-coherent log for shared-storage. Device-mapper mirroring -can be used in a shared-storage environment when the cluster log implementations -are employed. diff --git a/Documentation/device-mapper/dm-queue-length.rst b/Documentation/device-mapper/dm-queue-length.rst new file mode 100644 index 000000000000..d8e381c1cb02 --- /dev/null +++ b/Documentation/device-mapper/dm-queue-length.rst @@ -0,0 +1,48 @@ +=============== +dm-queue-length +=============== + +dm-queue-length is a path selector module for device-mapper targets, +which selects a path with the least number of in-flight I/Os. +The path selector name is 'queue-length'. + +Table parameters for each path: [] + +:: + + : The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + +Status for each path: + +:: + + : 'A' if the path is active, 'F' if the path is failed. + : The number of path failures. + : The number of in-flight I/Os on the path. + + +Algorithm +========= + +dm-queue-length increments/decrements 'in-flight' when an I/O is +dispatched/completed respectively. +dm-queue-length selects a path with the minimum 'in-flight'. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128. + +:: + + # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt deleted file mode 100644 index f4db2562175c..000000000000 --- a/Documentation/device-mapper/dm-queue-length.txt +++ /dev/null @@ -1,39 +0,0 @@ -dm-queue-length -=============== - -dm-queue-length is a path selector module for device-mapper targets, -which selects a path with the least number of in-flight I/Os. -The path selector name is 'queue-length'. - -Table parameters for each path: [] - : The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - -Status for each path: - : 'A' if the path is active, 'F' if the path is failed. - : The number of path failures. - : The number of in-flight I/Os on the path. - - -Algorithm -========= - -dm-queue-length increments/decrements 'in-flight' when an I/O is -dispatched/completed respectively. -dm-queue-length selects a path with the minimum 'in-flight'. - - -Examples -======== -In case that 2 paths (sda and sdb) are used with repeat_count == 128. - -# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/device-mapper/dm-raid.rst b/Documentation/device-mapper/dm-raid.rst new file mode 100644 index 000000000000..2fe255b130fb --- /dev/null +++ b/Documentation/device-mapper/dm-raid.rst @@ -0,0 +1,419 @@ +======= +dm-raid +======= + +The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. +It allows the MD RAID drivers to be accessed using a device-mapper +interface. + + +Mapping Table Interface +----------------------- +The target is named "raid" and it accepts the following parameters:: + + <#raid_params> \ + <#raid_devs> [.. ] + +: + + ============= =============================================================== + raid0 RAID0 striping (no resilience) + raid1 RAID1 mirroring + raid4 RAID4 with dedicated last parity disk + raid5_n RAID5 with dedicated last parity disk supporting takeover + Same as raid4 + + - Transitory layout + raid5_la RAID5 left asymmetric + + - rotating parity 0 with data continuation + raid5_ra RAID5 right asymmetric + + - rotating parity N with data continuation + raid5_ls RAID5 left symmetric + + - rotating parity 0 with data restart + raid5_rs RAID5 right symmetric + + - rotating parity N with data restart + raid6_zr RAID6 zero restart + + - rotating parity zero (left-to-right) with data restart + raid6_nr RAID6 N restart + + - rotating parity N (right-to-left) with data restart + raid6_nc RAID6 N continue + + - rotating parity N (right-to-left) with data continuation + raid6_n_6 RAID6 with dedicate parity disks + + - parity and Q-syndrome on the last 2 disks; + layout for takeover from/to raid4/raid5_n + raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_la from/to raid6 + raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk + + - layout for takeover from raid5_ra from/to raid6 + raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk + + - layout for takeover from raid5_ls from/to raid6 + raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk + + - layout for takeover from raid5_rs from/to raid6 + raid10 Various RAID10 inspired algorithms chosen by additional params + (see raid10_format and raid10_copies below) + + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') + - RAID1E: Integrated Adjacent Stripe Mirroring + - RAID1E: Integrated Offset Stripe Mirroring + - and other similar RAID10 variants + ============= =============================================================== + + Reference: Chapter 4 of + http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf + +<#raid_params>: The number of parameters that follow. + + consists of + + Mandatory parameters: + : + Chunk size in sectors. This parameter is often known as + "stripe size". It is the only mandatory parameter and + is placed first. + + followed by optional parameters (in any order): + [sync|nosync] + Force or prevent RAID initialization. + + [rebuild ] + Rebuild drive number 'idx' (first drive is 0). + + [daemon_sleep ] + Interval between runs of the bitmap daemon that + clear bits. A longer interval means less bitmap I/O but + resyncing after a failure is likely to take longer. + + [min_recovery_rate ] + Throttle RAID initialization + [max_recovery_rate ] + Throttle RAID initialization + [write_mostly ] + Mark drive index 'idx' write-mostly. + [max_write_behind ] + See '--write-behind=' (man mdadm) + [stripe_cache ] + Stripe cache size (RAID 4/5/6 only) + [region_size ] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. + + [raid10_copies <# copies>], [raid10_format ] + These two options are used to alter the default layout of + a RAID10 configuration. The number of copies is can be + specified, but the default is 2. There are also three + variations to how the copies are laid down - the default + is "near". Near copies are what most people think of with + respect to mirroring. If these options are left unspecified, + or 'raid10_copies 2' and/or 'raid10_format near' are given, + then the layouts for 2, 3 and 4 devices are: + + ======== ========== ============== + 2 drives 3 drives 4 drives + ======== ========== ============== + A1 A1 A1 A1 A2 A1 A1 A2 A2 + A2 A2 A2 A3 A3 A3 A3 A4 A4 + A3 A3 A4 A4 A5 A5 A5 A6 A6 + A4 A4 A5 A6 A6 A7 A7 A8 A8 + .. .. .. .. .. .. .. .. .. + ======== ========== ============== + + The 2-device layout is equivalent 2-way RAID1. The 4-device + layout is what a traditional RAID10 would look like. The + 3-device layout is what might be called a 'RAID1E - Integrated + Adjacent Stripe Mirroring'. + + If 'raid10_copies 2' and 'raid10_format far', then the layouts + for 2, 3 and 4 devices are: + + ======== ============ =================== + 2 drives 3 drives 4 drives + ======== ============ =================== + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + .. .. .. .. .. .. .. .. .. + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ============ =================== + + If 'raid10_copies 2' and 'raid10_format offset', then the + layouts for 2, 3 and 4 devices are: + + ======== ========== ================ + 2 drives 3 drives 4 drives + ======== ========== ================ + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ========== ================ + + Here we see layouts closely akin to 'RAID1E - Integrated + Offset Stripe Mirroring'. + + [delta_disks ] + The delta_disks option value (-251 < N < +251) triggers + device removal (negative value) or device addition (positive + value) to any reshape supporting raid levels 4/5/6 and 10. + RAID levels 4/5/6 allow for addition of devices (metadata + and data device tuple), raid10_near and raid10_offset only + allow for device addition. raid10_far does not support any + reshaping at all. + A minimum of devices have to be kept to enforce resilience, + which is 3 devices for raid4/5 and 4 devices for raid6. + + [data_offset ] + This option value defines the offset into each data device + where the data starts. This is used to provide out-of-place + reshaping space to avoid writing over data while + changing the layout of stripes, hence an interruption/crash + may happen at any time without the risk of losing data. + E.g. when adding devices to an existing raid set during + forward reshaping, the out-of-place space will be allocated + at the beginning of each raid device. The kernel raid4/5/6/10 + MD personalities supporting such device addition will read the data from + the existing first stripes (those with smaller number of stripes) + starting at data_offset to fill up a new stripe with the larger + number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) + and write that new stripe to offset 0. Same will be applied to all + N-1 other new stripes. This out-of-place scheme is used to change + the RAID type (i.e. the allocation algorithm) as well, e.g. + changing from raid5_ls to raid5_n. + + [journal_dev ] + This option adds a journal device to raid4/5/6 raid sets and + uses it to close the 'write hole' caused by the non-atomic updates + to the component devices which can cause data loss during recovery. + The journal device is used as writethrough thus causing writes to + be throttled versus non-journaled raid4/5/6 sets. + Takeover/reshape is not possible with a raid4/5/6 journal device; + it has to be deconfigured before requesting these. + + [journal_mode ] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev ' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + +<#raid_devs>: The number of devices composing the array. + Each device consists of two entries. The first is the device + containing the metadata (if any); the second is the one containing the + data. A Maximum of 64 metadata/data device entries are supported + up to target version 1.8.0. + 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. + + If a drive has failed or is missing at creation time, a '-' can be + given for both the metadata and data drives for a given position. + + +Example Tables +-------------- + +:: + + # RAID4 - 4 data drives, 1 parity (no metadata devices) + # No metadata devices specified to hold superblock/bitmap info + # Chunk size of 1MiB + # (Lines separated for easy reading) + + 0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + + # RAID4 - 4 data drives, 1 parity (with metadata devices) + # Chunk size of 1MiB, force RAID initialization, + # min recovery rate at 20 kiB/sec/disk + + 0 1960893648 raid \ + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 + + +Status Output +------------- +'dmsetup table' displays the table used to construct the mapping. +The optional parameters are always printed in the order listed +above with "sync" or "nosync" always output ahead of the other +arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. + + +'dmsetup status' yields information on the state and health of the array. +The output is as follows (normally a single line, but expanded here for +clarity):: + + 1: raid \ + 2: <#devices> \ + 3: + +Line 1 is the standard output produced by device-mapper. + +Line 2 & 3 are produced by the raid target and are best explained by example:: + + 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 + +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with its initial +recovery. Here is a fuller description of the individual fields: + + =============== ========================================================= + Same as the used to create the array. + One char for each device, indicating: + + - 'A' = alive and in-sync + - 'a' = alive but not in-sync + - 'D' = dead/failed. + The ratio indicating how much of the array has undergone + the process described by 'sync_action'. If the + 'sync_action' is "check" or "repair", then the process + of "resync" or "recover" can be considered complete. + One of the following possible states: + + idle + - No synchronization action is being performed. + frozen + - The current action has been halted. + resync + - Array is undergoing its initial synchronization + or is resynchronizing after an unclean shutdown + (possibly aided by a bitmap). + recover + - A device in the array is being rebuilt or + replaced. + check + - A user-initiated full check of the array is + being performed. All blocks are read and + checked for consistency. The number of + discrepancies found are recorded in + . No changes are made to the + array by this action. + repair + - The same as "check", but discrepancies are + corrected. + reshape + - The array is undergoing a reshape. + The number of discrepancies found between mirror copies + in RAID1/10 or wrong parity values found in RAID4/5/6. + This value is valid only after a "check" of the array + is performed. A healthy array has a 'mismatch_cnt' of 0. + The current data offset to the start of the user data on + each component device of a raid set (see the respective + raid parameter to support out-of-place reshaping). + - 'A' - active write-through journal device. + - 'a' - active write-back journal device. + - 'D' - dead journal device. + - '-' - no journal device. + =============== ========================================================= + + +Message Interface +----------------- +The dm-raid target will accept certain actions through the 'message' interface. +('man dmsetup' for more information on the message interface.) These actions +include: + + ========= ================================================ + "idle" Halt the current sync action. + "frozen" Freeze the current sync action. + "resync" Initiate/continue a resync. + "recover" Initiate/continue a recover process. + "check" Initiate a check (i.e. a "scrub") of the array. + "repair" Initiate a repair of the array. + ========= ================================================ + + +Discard Support +--------------- +The implementation of discard support among hardware vendors varies. +When a block is discarded, some storage devices will return zeroes when +the block is read. These devices set the 'discard_zeroes_data' +attribute. Other devices will return random data. Confusingly, some +devices that advertise 'discard_zeroes_data' will not reliably return +zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks +from a number of devices to calculate parity blocks and (for performance +reasons) relies on 'discard_zeroes_data' being reliable, it is important +that the devices be consistent. Blocks may be discarded in the middle +of a RAID 4/5/6 stripe and if subsequent read results are not +consistent, the parity blocks may be calculated differently at any time; +making the parity blocks useless for redundancy. It is important to +understand how your hardware behaves with discards if you are going to +enable discards with RAID 4/5/6. + +Since the behavior of storage devices is unreliable in this respect, +even when reporting 'discard_zeroes_data', by default RAID 4/5/6 +discard support is disabled -- this ensures data integrity at the +expense of losing some performance. + +Storage devices that properly support 'discard_zeroes_data' are +increasingly whitelisted in the kernel and can thus be trusted. + +For trusted devices, the following dm-raid module parameter can be set +to safely enable discard support for RAID 4/5/6: + + 'devices_handle_discards_safely' + + +Version History +--------------- + +:: + + 1.0.0 Initial version. Support for RAID 4/5/6 + 1.1.0 Added support for RAID 1 + 1.2.0 Handle creation of arrays that contain failed devices. + 1.3.0 Added support for RAID 10 + 1.3.1 Allow device replacement/rebuild for RAID 10 + 1.3.2 Fix/improve redundancy checking for RAID10 + 1.4.0 Non-functional change. Removes arg from mapping function. + 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). + 1.4.2 Add RAID10 "far" and "offset" algorithm support. + 1.5.0 Add message interface to allow manipulation of the sync_action. + New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. + 1.5.1 Add ability to restore transiently failed devices on resume. + 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". + 1.6.0 Add discard support (and devices_handle_discard_safely module param). + 1.7.0 Add support for MD RAID0 mappings. + 1.8.0 Explicitly check for compatible flags in the superblock metadata + and reject to start the raid set if any are set by a newer + target version, thus avoiding data corruption on a raid set + with a reshape in progress. + 1.9.0 Add support for RAID level takeover/reshape/region size + and set size reduction. + 1.9.1 Fix activation of existing RAID 4/10 mapped devices + 1.9.2 Don't emit '- -' on the status table line in case the constructor + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and + 'D' on the status line. If '- -' is passed into the constructor, emit + '- -' on the table line and '-' as the status line health character. + 1.10.0 Add support for raid4/5/6 journal device + 1.10.1 Fix data corruption on reshape request + 1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) + 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option + 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available + 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') + 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an + state races. + 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen + 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape + deadlock/potential data corruption. Update superblock when + specific devices are requested via rebuild. Fix RAID leg + rebuild errors. diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt deleted file mode 100644 index 2355bef14653..000000000000 --- a/Documentation/device-mapper/dm-raid.txt +++ /dev/null @@ -1,354 +0,0 @@ -dm-raid -======= - -The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. -It allows the MD RAID drivers to be accessed using a device-mapper -interface. - - -Mapping Table Interface ------------------------ -The target is named "raid" and it accepts the following parameters: - - <#raid_params> \ - <#raid_devs> [.. ] - -: - raid0 RAID0 striping (no resilience) - raid1 RAID1 mirroring - raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk supporting takeover - Same as raid4 - -Transitory layout - raid5_la RAID5 left asymmetric - - rotating parity 0 with data continuation - raid5_ra RAID5 right asymmetric - - rotating parity N with data continuation - raid5_ls RAID5 left symmetric - - rotating parity 0 with data restart - raid5_rs RAID5 right symmetric - - rotating parity N with data restart - raid6_zr RAID6 zero restart - - rotating parity zero (left-to-right) with data restart - raid6_nr RAID6 N restart - - rotating parity N (right-to-left) with data restart - raid6_nc RAID6 N continue - - rotating parity N (right-to-left) with data continuation - raid6_n_6 RAID6 with dedicate parity disks - - parity and Q-syndrome on the last 2 disks; - layout for takeover from/to raid4/raid5_n - raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk - - layout for takeover from raid5_la from/to raid6 - raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk - - layout for takeover from raid5_ra from/to raid6 - raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk - - layout for takeover from raid5_ls from/to raid6 - raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk - - layout for takeover from raid5_rs from/to raid6 - raid10 Various RAID10 inspired algorithms chosen by additional params - (see raid10_format and raid10_copies below) - - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - - RAID1E: Integrated Adjacent Stripe Mirroring - - RAID1E: Integrated Offset Stripe Mirroring - - and other similar RAID10 variants - - Reference: Chapter 4 of - http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf - -<#raid_params>: The number of parameters that follow. - - consists of - Mandatory parameters: - : Chunk size in sectors. This parameter is often known as - "stripe size". It is the only mandatory parameter and - is placed first. - - followed by optional parameters (in any order): - [sync|nosync] Force or prevent RAID initialization. - - [rebuild ] Rebuild drive number 'idx' (first drive is 0). - - [daemon_sleep ] - Interval between runs of the bitmap daemon that - clear bits. A longer interval means less bitmap I/O but - resyncing after a failure is likely to take longer. - - [min_recovery_rate ] Throttle RAID initialization - [max_recovery_rate ] Throttle RAID initialization - [write_mostly ] Mark drive index 'idx' write-mostly. - [max_write_behind ] See '--write-behind=' (man mdadm) - [stripe_cache ] Stripe cache size (RAID 4/5/6 only) - [region_size ] - The region_size multiplied by the number of regions is the - logical size of the array. The bitmap records the device - synchronisation state for each region. - - [raid10_copies <# copies>] - [raid10_format ] - These two options are used to alter the default layout of - a RAID10 configuration. The number of copies is can be - specified, but the default is 2. There are also three - variations to how the copies are laid down - the default - is "near". Near copies are what most people think of with - respect to mirroring. If these options are left unspecified, - or 'raid10_copies 2' and/or 'raid10_format near' are given, - then the layouts for 2, 3 and 4 devices are: - 2 drives 3 drives 4 drives - -------- ---------- -------------- - A1 A1 A1 A1 A2 A1 A1 A2 A2 - A2 A2 A2 A3 A3 A3 A3 A4 A4 - A3 A3 A4 A4 A5 A5 A5 A6 A6 - A4 A4 A5 A6 A6 A7 A7 A8 A8 - .. .. .. .. .. .. .. .. .. - The 2-device layout is equivalent 2-way RAID1. The 4-device - layout is what a traditional RAID10 would look like. The - 3-device layout is what might be called a 'RAID1E - Integrated - Adjacent Stripe Mirroring'. - - If 'raid10_copies 2' and 'raid10_format far', then the layouts - for 2, 3 and 4 devices are: - 2 drives 3 drives 4 drives - -------- -------------- -------------------- - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - .. .. .. .. .. .. .. .. .. - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - - If 'raid10_copies 2' and 'raid10_format offset', then the - layouts for 2, 3 and 4 devices are: - 2 drives 3 drives 4 drives - -------- ------------ ----------------- - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - Here we see layouts closely akin to 'RAID1E - Integrated - Offset Stripe Mirroring'. - - [delta_disks ] - The delta_disks option value (-251 < N < +251) triggers - device removal (negative value) or device addition (positive - value) to any reshape supporting raid levels 4/5/6 and 10. - RAID levels 4/5/6 allow for addition of devices (metadata - and data device tuple), raid10_near and raid10_offset only - allow for device addition. raid10_far does not support any - reshaping at all. - A minimum of devices have to be kept to enforce resilience, - which is 3 devices for raid4/5 and 4 devices for raid6. - - [data_offset ] - This option value defines the offset into each data device - where the data starts. This is used to provide out-of-place - reshaping space to avoid writing over data while - changing the layout of stripes, hence an interruption/crash - may happen at any time without the risk of losing data. - E.g. when adding devices to an existing raid set during - forward reshaping, the out-of-place space will be allocated - at the beginning of each raid device. The kernel raid4/5/6/10 - MD personalities supporting such device addition will read the data from - the existing first stripes (those with smaller number of stripes) - starting at data_offset to fill up a new stripe with the larger - number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) - and write that new stripe to offset 0. Same will be applied to all - N-1 other new stripes. This out-of-place scheme is used to change - the RAID type (i.e. the allocation algorithm) as well, e.g. - changing from raid5_ls to raid5_n. - - [journal_dev ] - This option adds a journal device to raid4/5/6 raid sets and - uses it to close the 'write hole' caused by the non-atomic updates - to the component devices which can cause data loss during recovery. - The journal device is used as writethrough thus causing writes to - be throttled versus non-journaled raid4/5/6 sets. - Takeover/reshape is not possible with a raid4/5/6 journal device; - it has to be deconfigured before requesting these. - - [journal_mode ] - This option sets the caching mode on journaled raid4/5/6 raid sets - (see 'journal_dev ' above) to 'writethrough' or 'writeback'. - If 'writeback' is selected the journal device has to be resilient - and must not suffer from the 'write hole' problem itself (e.g. use - raid1 or raid10) to avoid a single point of failure. - -<#raid_devs>: The number of devices composing the array. - Each device consists of two entries. The first is the device - containing the metadata (if any); the second is the one containing the - data. A Maximum of 64 metadata/data device entries are supported - up to target version 1.8.0. - 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. - - If a drive has failed or is missing at creation time, a '-' can be - given for both the metadata and data drives for a given position. - - -Example Tables --------------- -# RAID4 - 4 data drives, 1 parity (no metadata devices) -# No metadata devices specified to hold superblock/bitmap info -# Chunk size of 1MiB -# (Lines separated for easy reading) - -0 1960893648 raid \ - raid4 1 2048 \ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 - -# RAID4 - 4 data drives, 1 parity (with metadata devices) -# Chunk size of 1MiB, force RAID initialization, -# min recovery rate at 20 kiB/sec/disk - -0 1960893648 raid \ - raid4 4 2048 sync min_recovery_rate 20 \ - 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 - - -Status Output -------------- -'dmsetup table' displays the table used to construct the mapping. -The optional parameters are always printed in the order listed -above with "sync" or "nosync" always output ahead of the other -arguments, regardless of the order used when originally loading the table. -Arguments that can be repeated are ordered by value. - - -'dmsetup status' yields information on the state and health of the array. -The output is as follows (normally a single line, but expanded here for -clarity): -1: raid \ -2: <#devices> \ -3: - -Line 1 is the standard output produced by device-mapper. -Line 2 & 3 are produced by the raid target and are best explained by example: - 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 -Here we can see the RAID type is raid4, there are 5 devices - all of -which are 'A'live, and the array is 2/490221568 complete with its initial -recovery. Here is a fuller description of the individual fields: - Same as the used to create the array. - One char for each device, indicating: 'A' = alive and - in-sync, 'a' = alive but not in-sync, 'D' = dead/failed. - The ratio indicating how much of the array has undergone - the process described by 'sync_action'. If the - 'sync_action' is "check" or "repair", then the process - of "resync" or "recover" can be considered complete. - One of the following possible states: - idle - No synchronization action is being performed. - frozen - The current action has been halted. - resync - Array is undergoing its initial synchronization - or is resynchronizing after an unclean shutdown - (possibly aided by a bitmap). - recover - A device in the array is being rebuilt or - replaced. - check - A user-initiated full check of the array is - being performed. All blocks are read and - checked for consistency. The number of - discrepancies found are recorded in - . No changes are made to the - array by this action. - repair - The same as "check", but discrepancies are - corrected. - reshape - The array is undergoing a reshape. - The number of discrepancies found between mirror copies - in RAID1/10 or wrong parity values found in RAID4/5/6. - This value is valid only after a "check" of the array - is performed. A healthy array has a 'mismatch_cnt' of 0. - The current data offset to the start of the user data on - each component device of a raid set (see the respective - raid parameter to support out-of-place reshaping). - 'A' - active write-through journal device. - 'a' - active write-back journal device. - 'D' - dead journal device. - '-' - no journal device. - - -Message Interface ------------------ -The dm-raid target will accept certain actions through the 'message' interface. -('man dmsetup' for more information on the message interface.) These actions -include: - "idle" - Halt the current sync action. - "frozen" - Freeze the current sync action. - "resync" - Initiate/continue a resync. - "recover"- Initiate/continue a recover process. - "check" - Initiate a check (i.e. a "scrub") of the array. - "repair" - Initiate a repair of the array. - - -Discard Support ---------------- -The implementation of discard support among hardware vendors varies. -When a block is discarded, some storage devices will return zeroes when -the block is read. These devices set the 'discard_zeroes_data' -attribute. Other devices will return random data. Confusingly, some -devices that advertise 'discard_zeroes_data' will not reliably return -zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks -from a number of devices to calculate parity blocks and (for performance -reasons) relies on 'discard_zeroes_data' being reliable, it is important -that the devices be consistent. Blocks may be discarded in the middle -of a RAID 4/5/6 stripe and if subsequent read results are not -consistent, the parity blocks may be calculated differently at any time; -making the parity blocks useless for redundancy. It is important to -understand how your hardware behaves with discards if you are going to -enable discards with RAID 4/5/6. - -Since the behavior of storage devices is unreliable in this respect, -even when reporting 'discard_zeroes_data', by default RAID 4/5/6 -discard support is disabled -- this ensures data integrity at the -expense of losing some performance. - -Storage devices that properly support 'discard_zeroes_data' are -increasingly whitelisted in the kernel and can thus be trusted. - -For trusted devices, the following dm-raid module parameter can be set -to safely enable discard support for RAID 4/5/6: - 'devices_handle_discards_safely' - - -Version History ---------------- -1.0.0 Initial version. Support for RAID 4/5/6 -1.1.0 Added support for RAID 1 -1.2.0 Handle creation of arrays that contain failed devices. -1.3.0 Added support for RAID 10 -1.3.1 Allow device replacement/rebuild for RAID 10 -1.3.2 Fix/improve redundancy checking for RAID10 -1.4.0 Non-functional change. Removes arg from mapping function. -1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). -1.4.2 Add RAID10 "far" and "offset" algorithm support. -1.5.0 Add message interface to allow manipulation of the sync_action. - New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. -1.5.1 Add ability to restore transiently failed devices on resume. -1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". -1.6.0 Add discard support (and devices_handle_discard_safely module param). -1.7.0 Add support for MD RAID0 mappings. -1.8.0 Explicitly check for compatible flags in the superblock metadata - and reject to start the raid set if any are set by a newer - target version, thus avoiding data corruption on a raid set - with a reshape in progress. -1.9.0 Add support for RAID level takeover/reshape/region size - and set size reduction. -1.9.1 Fix activation of existing RAID 4/10 mapped devices -1.9.2 Don't emit '- -' on the status table line in case the constructor - fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and - 'D' on the status line. If '- -' is passed into the constructor, emit - '- -' on the table line and '-' as the status line health character. -1.10.0 Add support for raid4/5/6 journal device -1.10.1 Fix data corruption on reshape request -1.11.0 Fix table line argument order - (wrong raid10_copies/raid10_format sequence) -1.11.1 Add raid4/5/6 journal write-back support via journal_mode option -1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available -1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') -1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an - state races. -1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen -1.14.0 Fix reshape race on small devices. Fix stripe adding reshape - deadlock/potential data corruption. Update superblock when - specific devices are requested via rebuild. Fix RAID leg - rebuild errors. diff --git a/Documentation/device-mapper/dm-service-time.rst b/Documentation/device-mapper/dm-service-time.rst new file mode 100644 index 000000000000..facf277fc13c --- /dev/null +++ b/Documentation/device-mapper/dm-service-time.rst @@ -0,0 +1,101 @@ +=============== +dm-service-time +=============== + +dm-service-time is a path selector module for device-mapper targets, +which selects a path with the shortest estimated service time for +the incoming I/O. + +The service time for each path is estimated by dividing the total size +of in-flight I/Os on a path with the performance value of the path. +The performance value is a relative throughput value among all paths +in a path-group, and it can be specified as a table argument. + +The path selector name is 'service-time'. + +Table parameters for each path: + + [ []] + : + The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + : + The relative throughput value of the path + among all paths in the path-group. + The valid range is 0-100. + If not given, minimum value '1' is used. + If '0' is given, the path isn't selected while + other paths having a positive value are available. + +Status for each path: + + + : + 'A' if the path is active, 'F' if the path is failed. + : + The number of path failures. + : + The size of in-flight I/Os on the path. + : + The relative throughput value of the path + among all paths in the path-group. + + +Algorithm +========= + +dm-service-time adds the I/O size to 'in-flight-size' when the I/O is +dispatched and subtracts when completed. +Basically, dm-service-time selects a path having minimum service time +which is calculated by:: + + ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' + +However, some optimizations below are used to reduce the calculation +as much as possible. + + 1. If the paths have the same 'relative_throughput', skip + the division and just compare the 'in-flight-size'. + + 2. If the paths have the same 'in-flight-size', skip the division + and just compare the 'relative_throughput'. + + 3. If some paths have non-zero 'relative_throughput' and others + have zero 'relative_throughput', ignore those paths with zero + 'relative_throughput'. + +If such optimizations can't be applied, calculate service time, and +compare service time. +If calculated service time is equal, the path having maximum +'relative_throughput' may be better. So compare 'relative_throughput' +then. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128 +and sda has an average throughput 1GB/s and sdb has 4GB/s, +'relative_throughput' value may be '1' for sda and '4' for sdb:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 + + +Or '2' for sda and '8' for sdb would be also true:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt deleted file mode 100644 index fb1d4a0cf122..000000000000 --- a/Documentation/device-mapper/dm-service-time.txt +++ /dev/null @@ -1,91 +0,0 @@ -dm-service-time -=============== - -dm-service-time is a path selector module for device-mapper targets, -which selects a path with the shortest estimated service time for -the incoming I/O. - -The service time for each path is estimated by dividing the total size -of in-flight I/Os on a path with the performance value of the path. -The performance value is a relative throughput value among all paths -in a path-group, and it can be specified as a table argument. - -The path selector name is 'service-time'. - -Table parameters for each path: [ []] - : The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - : The relative throughput value of the path - among all paths in the path-group. - The valid range is 0-100. - If not given, minimum value '1' is used. - If '0' is given, the path isn't selected while - other paths having a positive value are available. - -Status for each path: \ - - : 'A' if the path is active, 'F' if the path is failed. - : The number of path failures. - : The size of in-flight I/Os on the path. - : The relative throughput value of the path - among all paths in the path-group. - - -Algorithm -========= - -dm-service-time adds the I/O size to 'in-flight-size' when the I/O is -dispatched and subtracts when completed. -Basically, dm-service-time selects a path having minimum service time -which is calculated by: - - ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' - -However, some optimizations below are used to reduce the calculation -as much as possible. - - 1. If the paths have the same 'relative_throughput', skip - the division and just compare the 'in-flight-size'. - - 2. If the paths have the same 'in-flight-size', skip the division - and just compare the 'relative_throughput'. - - 3. If some paths have non-zero 'relative_throughput' and others - have zero 'relative_throughput', ignore those paths with zero - 'relative_throughput'. - -If such optimizations can't be applied, calculate service time, and -compare service time. -If calculated service time is equal, the path having maximum -'relative_throughput' may be better. So compare 'relative_throughput' -then. - - -Examples -======== -In case that 2 paths (sda and sdb) are used with repeat_count == 128 -and sda has an average throughput 1GB/s and sdb has 4GB/s, -'relative_throughput' value may be '1' for sda and '4' for sdb. - -# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 - - -Or '2' for sda and '8' for sdb would be also true. - -# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/device-mapper/dm-uevent.rst b/Documentation/device-mapper/dm-uevent.rst new file mode 100644 index 000000000000..4a8ee8d069c9 --- /dev/null +++ b/Documentation/device-mapper/dm-uevent.rst @@ -0,0 +1,110 @@ +==================== +device-mapper uevent +==================== + +The device-mapper uevent code adds the capability to device-mapper to create +and send kobject uevents (uevents). Previously device-mapper events were only +available through the ioctl interface. The advantage of the uevents interface +is the event contains environment attributes providing increased context for +the event avoiding the need to query the state of the device-mapper device after +the event is received. + +There are two functions currently for device-mapper events. The first function +listed creates the event and the second function sends the event(s):: + + void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) + + void dm_send_uevents(struct list_head *events, struct kobject *kobj) + + +The variables added to the uevent environment are: + +Variable Name: DM_TARGET +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Name of device-mapper target that generated the event. + +Variable Name: DM_ACTION +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Device-mapper specific action that caused the uevent action. + PATH_FAILED - A path has failed; + PATH_REINSTATED - A path has been reinstated. + +Variable Name: DM_SEQNUM +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: A sequence number for this specific device-mapper device. +:Value: Valid unsigned integer range. + +Variable Name: DM_PATH +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Major and minor number of the path device pertaining to this + event. +:Value: Path name in the form of "Major:Minor" + +Variable Name: DM_NR_VALID_PATHS +-------------------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: +:Value: Valid unsigned integer range. + +Variable Name: DM_NAME +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Name of the device-mapper device. +:Value: Name + +Variable Name: DM_UUID +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: UUID of the device-mapper device. +:Value: UUID. (Empty string if there isn't one.) + +An example of the uevents generated as captured by udevmonitor is shown +below + +1.) Path failure:: + + UEVENT[1192521009.711215] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_FAILED + DM_SEQNUM=1 + DM_PATH=8:32 + DM_NR_VALID_PATHS=0 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1130 + +2.) Path reinstate:: + + UEVENT[1192521132.989927] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_REINSTATED + DM_SEQNUM=2 + DM_PATH=8:32 + DM_NR_VALID_PATHS=1 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt deleted file mode 100644 index 07edbd85c714..000000000000 --- a/Documentation/device-mapper/dm-uevent.txt +++ /dev/null @@ -1,97 +0,0 @@ -The device-mapper uevent code adds the capability to device-mapper to create -and send kobject uevents (uevents). Previously device-mapper events were only -available through the ioctl interface. The advantage of the uevents interface -is the event contains environment attributes providing increased context for -the event avoiding the need to query the state of the device-mapper device after -the event is received. - -There are two functions currently for device-mapper events. The first function -listed creates the event and the second function sends the event(s). - -void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, - const char *path, unsigned nr_valid_paths) - -void dm_send_uevents(struct list_head *events, struct kobject *kobj) - - -The variables added to the uevent environment are: - -Variable Name: DM_TARGET -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: -Value: Name of device-mapper target that generated the event. - -Variable Name: DM_ACTION -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: -Value: Device-mapper specific action that caused the uevent action. - PATH_FAILED - A path has failed. - PATH_REINSTATED - A path has been reinstated. - -Variable Name: DM_SEQNUM -Uevent Action(s): KOBJ_CHANGE -Type: unsigned integer -Description: A sequence number for this specific device-mapper device. -Value: Valid unsigned integer range. - -Variable Name: DM_PATH -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: Major and minor number of the path device pertaining to this -event. -Value: Path name in the form of "Major:Minor" - -Variable Name: DM_NR_VALID_PATHS -Uevent Action(s): KOBJ_CHANGE -Type: unsigned integer -Description: -Value: Valid unsigned integer range. - -Variable Name: DM_NAME -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: Name of the device-mapper device. -Value: Name - -Variable Name: DM_UUID -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: UUID of the device-mapper device. -Value: UUID. (Empty string if there isn't one.) - -An example of the uevents generated as captured by udevmonitor is shown -below. - -1.) Path failure. -UEVENT[1192521009.711215] change@/block/dm-3 -ACTION=change -DEVPATH=/block/dm-3 -SUBSYSTEM=block -DM_TARGET=multipath -DM_ACTION=PATH_FAILED -DM_SEQNUM=1 -DM_PATH=8:32 -DM_NR_VALID_PATHS=0 -DM_NAME=mpath2 -DM_UUID=mpath-35333333000002328 -MINOR=3 -MAJOR=253 -SEQNUM=1130 - -2.) Path reinstate. -UEVENT[1192521132.989927] change@/block/dm-3 -ACTION=change -DEVPATH=/block/dm-3 -SUBSYSTEM=block -DM_TARGET=multipath -DM_ACTION=PATH_REINSTATED -DM_SEQNUM=2 -DM_PATH=8:32 -DM_NR_VALID_PATHS=1 -DM_NAME=mpath2 -DM_UUID=mpath-35333333000002328 -MINOR=3 -MAJOR=253 -SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-zoned.rst b/Documentation/device-mapper/dm-zoned.rst new file mode 100644 index 000000000000..07f56ebc1730 --- /dev/null +++ b/Documentation/device-mapper/dm-zoned.rst @@ -0,0 +1,146 @@ +======== +dm-zoned +======== + +The dm-zoned device mapper target exposes a zoned block device (ZBC and +ZAC compliant devices) as a regular block device without any write +pattern constraints. In effect, it implements a drive-managed zoned +block device which hides from the user (a file system or an application +doing raw block device accesses) the sequential write constraints of +host-managed zoned block devices and can mitigate the potential +device-side performance degradation due to excessive random writes on +host-aware zoned block devices. + +For a more detailed description of the zoned block device models and +their constraints see (for SCSI devices): + +http://www.t10.org/drafts.htm#ZBC_Family + +and (for ATA devices): + +http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf + +The dm-zoned implementation is simple and minimizes system overhead (CPU +and memory usage as well as storage capacity loss). For a 10TB +host-managed disk with 256 MB zones, dm-zoned memory usage per disk +instance is at most 4.5 MB and as little as 5 zones will be used +internally for storing metadata and performaing reclaim operations. + +dm-zoned target devices are formatted and checked using the dmzadm +utility available at: + +https://github.com/hgst/dm-zoned-tools + +Algorithm +========= + +dm-zoned implements an on-disk buffering scheme to handle non-sequential +write accesses to the sequential zones of a zoned block device. +Conventional zones are used for caching as well as for storing internal +metadata. + +The zones of the device are separated into 2 types: + +1) Metadata zones: these are conventional zones used to store metadata. +Metadata zones are not reported as useable capacity to the user. + +2) Data zones: all remaining zones, the vast majority of which will be +sequential zones used exclusively to store user data. The conventional +zones of the device may be used also for buffering user random writes. +Data in these zones may be directly mapped to the conventional zone, but +later moved to a sequential zone so that the conventional zone can be +reused for buffering incoming random writes. + +dm-zoned exposes a logical device with a sector size of 4096 bytes, +irrespective of the physical sector size of the backend zoned block +device being used. This allows reducing the amount of metadata needed to +manage valid blocks (blocks written). + +The on-disk metadata format is as follows: + +1) The first block of the first conventional zone found contains the +super block which describes the on disk amount and position of metadata +blocks. + +2) Following the super block, a set of blocks is used to describe the +mapping of the logical device blocks. The mapping is done per chunk of +blocks, with the chunk size equal to the zoned block device size. The +mapping table is indexed by chunk number and each mapping entry +indicates the zone number of the device storing the chunk of data. Each +mapping entry may also indicate if the zone number of a conventional +zone used to buffer random modification to the data zone. + +3) A set of blocks used to store bitmaps indicating the validity of +blocks in the data zones follows the mapping table. A valid block is +defined as a block that was written and not discarded. For a buffered +data chunk, a block is always valid only in the data zone mapping the +chunk or in the buffer zone of the chunk. + +For a logical chunk mapped to a conventional zone, all write operations +are processed by directly writing to the zone. If the mapping zone is a +sequential zone, the write operation is processed directly only if the +write offset within the logical chunk is equal to the write pointer +offset within of the sequential data zone (i.e. the write operation is +aligned on the zone write pointer). Otherwise, write operations are +processed indirectly using a buffer zone. In that case, an unused +conventional zone is allocated and assigned to the chunk being +accessed. Writing a block to the buffer zone of a chunk will +automatically invalidate the same block in the sequential zone mapping +the chunk. If all blocks of the sequential zone become invalid, the zone +is freed and the chunk buffer zone becomes the primary zone mapping the +chunk, resulting in native random write performance similar to a regular +block device. + +Read operations are processed according to the block validity +information provided by the bitmaps. Valid blocks are read either from +the sequential zone mapping a chunk, or if the chunk is buffered, from +the buffer zone assigned. If the accessed chunk has no mapping, or the +accessed blocks are invalid, the read buffer is zeroed and the read +operation terminated. + +After some time, the limited number of convnetional zones available may +be exhausted (all used to map chunks or buffer sequential zones) and +unaligned writes to unbuffered chunks become impossible. To avoid this +situation, a reclaim process regularly scans used conventional zones and +tries to reclaim the least recently used zones by copying the valid +blocks of the buffer zone to a free sequential zone. Once the copy +completes, the chunk mapping is updated to point to the sequential zone +and the buffer zone freed for reuse. + +Metadata Protection +=================== + +To protect metadata against corruption in case of sudden power loss or +system crash, 2 sets of metadata zones are used. One set, the primary +set, is used as the main metadata region, while the secondary set is +used as a staging area. Modified metadata is first written to the +secondary set and validated by updating the super block in the secondary +set, a generation counter is used to indicate that this set contains the +newest metadata. Once this operation completes, in place of metadata +block updates can be done in the primary metadata set. This ensures that +one of the set is always consistent (all modifications committed or none +at all). Flush operations are used as a commit point. Upon reception of +a flush request, metadata modification activity is temporarily blocked +(for both incoming BIO processing and reclaim process) and all dirty +metadata blocks are staged and updated. Normal operation is then +resumed. Flushing metadata thus only temporarily delays write and +discard requests. Read requests can be processed concurrently while +metadata flush is being executed. + +Usage +===== + +A zoned block device must first be formatted using the dmzadm tool. This +will analyze the device zone configuration, determine where to place the +metadata sets on the device and initialize the metadata sets. + +Ex:: + + dmzadm --format /dev/sdxx + +For a formatted device, the target can be created normally with the +dmsetup utility. The only parameter that dm-zoned requires is the +underlying zoned block device name. Ex:: + + echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ + dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/device-mapper/dm-zoned.txt b/Documentation/device-mapper/dm-zoned.txt deleted file mode 100644 index 736fcc78d193..000000000000 --- a/Documentation/device-mapper/dm-zoned.txt +++ /dev/null @@ -1,144 +0,0 @@ -dm-zoned -======== - -The dm-zoned device mapper target exposes a zoned block device (ZBC and -ZAC compliant devices) as a regular block device without any write -pattern constraints. In effect, it implements a drive-managed zoned -block device which hides from the user (a file system or an application -doing raw block device accesses) the sequential write constraints of -host-managed zoned block devices and can mitigate the potential -device-side performance degradation due to excessive random writes on -host-aware zoned block devices. - -For a more detailed description of the zoned block device models and -their constraints see (for SCSI devices): - -http://www.t10.org/drafts.htm#ZBC_Family - -and (for ATA devices): - -http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf - -The dm-zoned implementation is simple and minimizes system overhead (CPU -and memory usage as well as storage capacity loss). For a 10TB -host-managed disk with 256 MB zones, dm-zoned memory usage per disk -instance is at most 4.5 MB and as little as 5 zones will be used -internally for storing metadata and performaing reclaim operations. - -dm-zoned target devices are formatted and checked using the dmzadm -utility available at: - -https://github.com/hgst/dm-zoned-tools - -Algorithm -========= - -dm-zoned implements an on-disk buffering scheme to handle non-sequential -write accesses to the sequential zones of a zoned block device. -Conventional zones are used for caching as well as for storing internal -metadata. - -The zones of the device are separated into 2 types: - -1) Metadata zones: these are conventional zones used to store metadata. -Metadata zones are not reported as useable capacity to the user. - -2) Data zones: all remaining zones, the vast majority of which will be -sequential zones used exclusively to store user data. The conventional -zones of the device may be used also for buffering user random writes. -Data in these zones may be directly mapped to the conventional zone, but -later moved to a sequential zone so that the conventional zone can be -reused for buffering incoming random writes. - -dm-zoned exposes a logical device with a sector size of 4096 bytes, -irrespective of the physical sector size of the backend zoned block -device being used. This allows reducing the amount of metadata needed to -manage valid blocks (blocks written). - -The on-disk metadata format is as follows: - -1) The first block of the first conventional zone found contains the -super block which describes the on disk amount and position of metadata -blocks. - -2) Following the super block, a set of blocks is used to describe the -mapping of the logical device blocks. The mapping is done per chunk of -blocks, with the chunk size equal to the zoned block device size. The -mapping table is indexed by chunk number and each mapping entry -indicates the zone number of the device storing the chunk of data. Each -mapping entry may also indicate if the zone number of a conventional -zone used to buffer random modification to the data zone. - -3) A set of blocks used to store bitmaps indicating the validity of -blocks in the data zones follows the mapping table. A valid block is -defined as a block that was written and not discarded. For a buffered -data chunk, a block is always valid only in the data zone mapping the -chunk or in the buffer zone of the chunk. - -For a logical chunk mapped to a conventional zone, all write operations -are processed by directly writing to the zone. If the mapping zone is a -sequential zone, the write operation is processed directly only if the -write offset within the logical chunk is equal to the write pointer -offset within of the sequential data zone (i.e. the write operation is -aligned on the zone write pointer). Otherwise, write operations are -processed indirectly using a buffer zone. In that case, an unused -conventional zone is allocated and assigned to the chunk being -accessed. Writing a block to the buffer zone of a chunk will -automatically invalidate the same block in the sequential zone mapping -the chunk. If all blocks of the sequential zone become invalid, the zone -is freed and the chunk buffer zone becomes the primary zone mapping the -chunk, resulting in native random write performance similar to a regular -block device. - -Read operations are processed according to the block validity -information provided by the bitmaps. Valid blocks are read either from -the sequential zone mapping a chunk, or if the chunk is buffered, from -the buffer zone assigned. If the accessed chunk has no mapping, or the -accessed blocks are invalid, the read buffer is zeroed and the read -operation terminated. - -After some time, the limited number of convnetional zones available may -be exhausted (all used to map chunks or buffer sequential zones) and -unaligned writes to unbuffered chunks become impossible. To avoid this -situation, a reclaim process regularly scans used conventional zones and -tries to reclaim the least recently used zones by copying the valid -blocks of the buffer zone to a free sequential zone. Once the copy -completes, the chunk mapping is updated to point to the sequential zone -and the buffer zone freed for reuse. - -Metadata Protection -=================== - -To protect metadata against corruption in case of sudden power loss or -system crash, 2 sets of metadata zones are used. One set, the primary -set, is used as the main metadata region, while the secondary set is -used as a staging area. Modified metadata is first written to the -secondary set and validated by updating the super block in the secondary -set, a generation counter is used to indicate that this set contains the -newest metadata. Once this operation completes, in place of metadata -block updates can be done in the primary metadata set. This ensures that -one of the set is always consistent (all modifications committed or none -at all). Flush operations are used as a commit point. Upon reception of -a flush request, metadata modification activity is temporarily blocked -(for both incoming BIO processing and reclaim process) and all dirty -metadata blocks are staged and updated. Normal operation is then -resumed. Flushing metadata thus only temporarily delays write and -discard requests. Read requests can be processed concurrently while -metadata flush is being executed. - -Usage -===== - -A zoned block device must first be formatted using the dmzadm tool. This -will analyze the device zone configuration, determine where to place the -metadata sets on the device and initialize the metadata sets. - -Ex: - -dmzadm --format /dev/sdxx - -For a formatted device, the target can be created normally with the -dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex: - -echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/device-mapper/era.rst b/Documentation/device-mapper/era.rst new file mode 100644 index 000000000000..90dd5c670b9f --- /dev/null +++ b/Documentation/device-mapper/era.rst @@ -0,0 +1,116 @@ +====== +dm-era +====== + +Introduction +============ + +dm-era is a target that behaves similar to the linear target. In +addition it keeps track of which blocks were written within a user +defined period of time called an 'era'. Each era target instance +maintains the current era as a monotonically increasing 32-bit +counter. + +Use cases include tracking changed blocks for backup software, and +partially invalidating the contents of a cache to restore cache +coherency after rolling back a vendor snapshot. + +Constructor +=========== + +era + + ================ ====================================================== + metadata dev fast device holding the persistent metadata + origin dev device holding data blocks that may change + block size block size of origin data device, granularity that is + tracked by the target + ================ ====================================================== + +Messages +======== + +None of the dm messages take any arguments. + +checkpoint +---------- + +Possibly move to a new era. You shouldn't assume the era has +incremented. After sending this message, you should check the +current era via the status line. + +take_metadata_snap +------------------ + +Create a clone of the metadata, to allow a userland process to read it. + +drop_metadata_snap +------------------ + +Drop the metadata snapshot. + +Status +====== + + <#used metadata blocks>/<#total metadata blocks> + + +========================= ============================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +current era The current era +held metadata root The location, in blocks, of the metadata root + that has been 'held' for userspace read + access. '-' indicates there is no held root +========================= ============================================== + +Detailed use case +================= + +The scenario of invalidating a cache when rolling back a vendor +snapshot was the primary use case when developing this target: + +Taking a vendor snapshot +------------------------ + +- Send a checkpoint message to the era target +- Make a note of the current era in its status line +- Take vendor snapshot (the era and snapshot should be forever + associated now). + +Rolling back to an vendor snapshot +---------------------------------- + +- Cache enters passthrough mode (see: dm-cache's docs in cache.txt) +- Rollback vendor storage +- Take metadata snapshot +- Ascertain which blocks have been written since the snapshot was taken + by checking each block's era +- Invalidate those blocks in the caching software +- Cache returns to writeback/writethrough mode + +Memory usage +============ + +The target uses a bitset to record writes in the current era. It also +has a spare bitset ready for switching over to a new era. Other than +that it uses a few 4k blocks for updating metadata:: + + (4 * nr_blocks) bytes + buffers + +Resilience +========== + +Metadata is updated on disk before a write to a previously unwritten +block is performed. As such dm-era should not be effected by a hard +crash such as power failure. + +Userland tools +============== + +Userland tools are found in the increasingly poorly named +thin-provisioning-tools project: + + https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/device-mapper/era.txt b/Documentation/device-mapper/era.txt deleted file mode 100644 index 3c6d01be3560..000000000000 --- a/Documentation/device-mapper/era.txt +++ /dev/null @@ -1,108 +0,0 @@ -Introduction -============ - -dm-era is a target that behaves similar to the linear target. In -addition it keeps track of which blocks were written within a user -defined period of time called an 'era'. Each era target instance -maintains the current era as a monotonically increasing 32-bit -counter. - -Use cases include tracking changed blocks for backup software, and -partially invalidating the contents of a cache to restore cache -coherency after rolling back a vendor snapshot. - -Constructor -=========== - - era - - metadata dev : fast device holding the persistent metadata - origin dev : device holding data blocks that may change - block size : block size of origin data device, granularity that is - tracked by the target - -Messages -======== - -None of the dm messages take any arguments. - -checkpoint ----------- - -Possibly move to a new era. You shouldn't assume the era has -incremented. After sending this message, you should check the -current era via the status line. - -take_metadata_snap ------------------- - -Create a clone of the metadata, to allow a userland process to read it. - -drop_metadata_snap ------------------- - -Drop the metadata snapshot. - -Status -====== - - <#used metadata blocks>/<#total metadata blocks> - - -metadata block size : Fixed block size for each metadata block in - sectors -#used metadata blocks : Number of metadata blocks used -#total metadata blocks : Total number of metadata blocks -current era : The current era -held metadata root : The location, in blocks, of the metadata root - that has been 'held' for userspace read - access. '-' indicates there is no held root - -Detailed use case -================= - -The scenario of invalidating a cache when rolling back a vendor -snapshot was the primary use case when developing this target: - -Taking a vendor snapshot ------------------------- - -- Send a checkpoint message to the era target -- Make a note of the current era in its status line -- Take vendor snapshot (the era and snapshot should be forever - associated now). - -Rolling back to an vendor snapshot ----------------------------------- - -- Cache enters passthrough mode (see: dm-cache's docs in cache.txt) -- Rollback vendor storage -- Take metadata snapshot -- Ascertain which blocks have been written since the snapshot was taken - by checking each block's era -- Invalidate those blocks in the caching software -- Cache returns to writeback/writethrough mode - -Memory usage -============ - -The target uses a bitset to record writes in the current era. It also -has a spare bitset ready for switching over to a new era. Other than -that it uses a few 4k blocks for updating metadata. - - (4 * nr_blocks) bytes + buffers - -Resilience -========== - -Metadata is updated on disk before a write to a previously unwritten -block is performed. As such dm-era should not be effected by a hard -crash such as power failure. - -Userland tools -============== - -Userland tools are found in the increasingly poorly named -thin-provisioning-tools project: - - https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/device-mapper/index.rst b/Documentation/device-mapper/index.rst new file mode 100644 index 000000000000..105e253bc231 --- /dev/null +++ b/Documentation/device-mapper/index.rst @@ -0,0 +1,44 @@ +:orphan: + +============= +Device Mapper +============= + +.. toctree:: + :maxdepth: 1 + + cache-policies + cache + delay + dm-crypt + dm-flakey + dm-init + dm-integrity + dm-io + dm-log + dm-queue-length + dm-raid + dm-service-time + dm-uevent + dm-zoned + era + kcopyd + linear + log-writes + persistent-data + snapshot + statistics + striped + switch + thin-provisioning + unstriped + verity + writecache + zero + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/device-mapper/kcopyd.rst b/Documentation/device-mapper/kcopyd.rst new file mode 100644 index 000000000000..7651d395127f --- /dev/null +++ b/Documentation/device-mapper/kcopyd.rst @@ -0,0 +1,47 @@ +====== +kcopyd +====== + +Kcopyd provides the ability to copy a range of sectors from one block-device +to one or more other block-devices, with an asynchronous completion +notification. It is used by dm-snapshot and dm-mirror. + +Users of kcopyd must first create a client and indicate how many memory pages +to set aside for their copy jobs. This is done with a call to +kcopyd_client_create():: + + int kcopyd_client_create(unsigned int num_pages, + struct kcopyd_client **result); + +To start a copy job, the user must set up io_region structures to describe +the source and destinations of the copy. Each io_region indicates a +block-device along with the starting sector and size of the region. The source +of the copy is given as one io_region structure, and the destinations of the +copy are given as an array of io_region structures:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +To start the copy, the user calls kcopyd_copy(), passing in the client +pointer, pointers to the source and destination io_regions, the name of a +completion callback routine, and a pointer to some context data for the copy:: + + int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context); + + typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, + void *context); + +When the copy completes, kcopyd will call the user's completion routine, +passing back the user's context pointer. It will also indicate if a read or +write error occurred during the copy. + +When a user is done with all their copy jobs, they should call +kcopyd_client_destroy() to delete the kcopyd client, which will release the +associated memory pages:: + + void kcopyd_client_destroy(struct kcopyd_client *kc); diff --git a/Documentation/device-mapper/kcopyd.txt b/Documentation/device-mapper/kcopyd.txt deleted file mode 100644 index 820382c4cecf..000000000000 --- a/Documentation/device-mapper/kcopyd.txt +++ /dev/null @@ -1,47 +0,0 @@ -kcopyd -====== - -Kcopyd provides the ability to copy a range of sectors from one block-device -to one or more other block-devices, with an asynchronous completion -notification. It is used by dm-snapshot and dm-mirror. - -Users of kcopyd must first create a client and indicate how many memory pages -to set aside for their copy jobs. This is done with a call to -kcopyd_client_create(). - - int kcopyd_client_create(unsigned int num_pages, - struct kcopyd_client **result); - -To start a copy job, the user must set up io_region structures to describe -the source and destinations of the copy. Each io_region indicates a -block-device along with the starting sector and size of the region. The source -of the copy is given as one io_region structure, and the destinations of the -copy are given as an array of io_region structures. - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -To start the copy, the user calls kcopyd_copy(), passing in the client -pointer, pointers to the source and destination io_regions, the name of a -completion callback routine, and a pointer to some context data for the copy. - - int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, - unsigned int num_dests, struct io_region *dests, - unsigned int flags, kcopyd_notify_fn fn, void *context); - - typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, - void *context); - -When the copy completes, kcopyd will call the user's completion routine, -passing back the user's context pointer. It will also indicate if a read or -write error occurred during the copy. - -When a user is done with all their copy jobs, they should call -kcopyd_client_destroy() to delete the kcopyd client, which will release the -associated memory pages. - - void kcopyd_client_destroy(struct kcopyd_client *kc); - diff --git a/Documentation/device-mapper/linear.rst b/Documentation/device-mapper/linear.rst new file mode 100644 index 000000000000..9d17fc6e64a9 --- /dev/null +++ b/Documentation/device-mapper/linear.rst @@ -0,0 +1,63 @@ +========= +dm-linear +========= + +Device-Mapper's "linear" target maps a linear range of the Device-Mapper +device onto a linear range of another device. This is the basic building +block of logical volume managers. + +Parameters: + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + + +Example scripts +=============== + +:: + + #!/bin/sh + # Create an identity mapping for a device + echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity + +:: + + #!/bin/sh + # Join 2 devices together + size1=`blockdev --getsz $1` + size2=`blockdev --getsz $2` + echo "0 $size1 linear $1 0 + $size1 $size2 linear $2 0" | dmsetup create joined + +:: + + #!/usr/bin/perl -w + # Split a device into 4M chunks and then join them together in reverse order. + + my $name = "reverse"; + my $extent_size = 4 * 1024 * 2; + my $dev = $ARGV[0]; + my $table = ""; + my $count = 0; + + if (!defined($dev)) { + die("Please specify a device.\n"); + } + + my $dev_size = `blockdev --getsz $dev`; + my $extents = int($dev_size / $extent_size) - + (($dev_size % $extent_size) ? 1 : 0); + + while ($extents > 0) { + my $this_start = $count * $extent_size; + $extents--; + $count++; + my $this_offset = $extents * $extent_size; + + $table .= "$this_start $extent_size linear $dev $this_offset\n"; + } + + `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/device-mapper/linear.txt b/Documentation/device-mapper/linear.txt deleted file mode 100644 index 7cb98d89d3f8..000000000000 --- a/Documentation/device-mapper/linear.txt +++ /dev/null @@ -1,61 +0,0 @@ -dm-linear -========= - -Device-Mapper's "linear" target maps a linear range of the Device-Mapper -device onto a linear range of another device. This is the basic building -block of logical volume managers. - -Parameters: - : Full pathname to the underlying block-device, or a - "major:minor" device-number. - : Starting sector within the device. - - -Example scripts -=============== -[[ -#!/bin/sh -# Create an identity mapping for a device -echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity -]] - - -[[ -#!/bin/sh -# Join 2 devices together -size1=`blockdev --getsz $1` -size2=`blockdev --getsz $2` -echo "0 $size1 linear $1 0 -$size1 $size2 linear $2 0" | dmsetup create joined -]] - - -[[ -#!/usr/bin/perl -w -# Split a device into 4M chunks and then join them together in reverse order. - -my $name = "reverse"; -my $extent_size = 4 * 1024 * 2; -my $dev = $ARGV[0]; -my $table = ""; -my $count = 0; - -if (!defined($dev)) { - die("Please specify a device.\n"); -} - -my $dev_size = `blockdev --getsz $dev`; -my $extents = int($dev_size / $extent_size) - - (($dev_size % $extent_size) ? 1 : 0); - -while ($extents > 0) { - my $this_start = $count * $extent_size; - $extents--; - $count++; - my $this_offset = $extents * $extent_size; - - $table .= "$this_start $extent_size linear $dev $this_offset\n"; -} - -`echo \"$table\" | dmsetup create $name`; -]] diff --git a/Documentation/device-mapper/log-writes.rst b/Documentation/device-mapper/log-writes.rst new file mode 100644 index 000000000000..23141f2ffb7c --- /dev/null +++ b/Documentation/device-mapper/log-writes.rst @@ -0,0 +1,145 @@ +============= +dm-log-writes +============= + +This target takes 2 devices, one to pass all IO to normally, and one to log all +of the write operations to. This is intended for file system developers wishing +to verify the integrity of metadata or data as the file system is written to. +There is a log_write_entry written for every WRITE request and the target is +able to take arbitrary data from userspace to insert into the log. The data +that is in the WRITE requests is copied into the log to make the replay happen +exactly as it happened originally. + +Log Ordering +============ + +We log things in order of completion once we are sure the write is no longer in +cache. This means that normal WRITE requests are not actually logged until the +next REQ_PREFLUSH request. This is to make it easier for userspace to replay +the log in a way that correlates to what is on disk and not what is in cache, +to make it easier to detect improper waiting/flushing. + +This works by attaching all WRITE requests to a list once the write completes. +Once we see a REQ_PREFLUSH request we splice this list onto the request and once +the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only +completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to +simulate the worst case scenario with regard to power failures. Consider the +following example (W means write, C means complete): + + W1,W2,W3,C3,C2,Wflush,C1,Cflush + +The log would show the following: + + W3,W2,flush,W1.... + +Again this is to simulate what is actually on disk, this allows us to detect +cases where a power failure at a particular point in time would create an +inconsistent file system. + +Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as +they complete as those requests will obviously bypass the device cache. + +Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would +have all the DISCARD requests, and then the WRITE requests and then the FLUSH +request. Consider the following example: + + WRITE block 1, DISCARD block 1, FLUSH + +If we logged DISCARD when it completed, the replay would look like this: + + DISCARD 1, WRITE 1, FLUSH + +which isn't quite what happened and wouldn't be caught during the log replay. + +Target interface +================ + +i) Constructor + + log-writes + + ============= ============================================== + dev_path Device that all of the IO will go to normally. + log_dev_path Device where the log entries are written to. + ============= ============================================== + +ii) Status + + <#logged entries> + + =========================== ======================== + #logged entries Number of logged entries + highest allocated sector Highest allocated sector + =========================== ======================== + +iii) Messages + + mark + + You can use a dmsetup message to set an arbitrary mark in a log. + For example say you want to fsck a file system after every + write, but first you need to replay up to the mkfs to make sure + we're fsck'ing something reasonable, you would do something like + this:: + + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + + This would allow you to replay the log up to the mkfs mark and + then replay from that point on doing the fsck check in the + interval that you want. + + Every log has a mark at the end labeled "dm-log-writes-end". + +Userspace component +=================== + +There is a userspace tool that will replay the log for you in various ways. +It can be found here: https://github.com/josefbacik/log-writes + +Example usage +============= + +Say you want to test fsync on your file system. You would do something like +this:: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + dmsetup message log 0 mark fsync + md5sum /mnt/btrfs-test/foo + umount /mnt/btrfs-test + + dmsetup remove log + replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync + mount /dev/sdb /mnt/btrfs-test + md5sum /mnt/btrfs-test/foo + + + Another option is to do a complicated file system operation and verify the file + system is consistent during the entire operation. You could do this with: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + btrfs filesystem balance /mnt/btrfs-test + umount /mnt/btrfs-test + dmsetup remove log + + replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs + btrfsck /dev/sdb + replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ + --fsck "btrfsck /dev/sdb" --check fua + +And that will replay the log until it sees a FUA request, run the fsck command +and if the fsck passes it will replay to the next FUA, until it is completed or +the fsck command exists abnormally. diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt deleted file mode 100644 index b638d124be6a..000000000000 --- a/Documentation/device-mapper/log-writes.txt +++ /dev/null @@ -1,140 +0,0 @@ -dm-log-writes -============= - -This target takes 2 devices, one to pass all IO to normally, and one to log all -of the write operations to. This is intended for file system developers wishing -to verify the integrity of metadata or data as the file system is written to. -There is a log_write_entry written for every WRITE request and the target is -able to take arbitrary data from userspace to insert into the log. The data -that is in the WRITE requests is copied into the log to make the replay happen -exactly as it happened originally. - -Log Ordering -============ - -We log things in order of completion once we are sure the write is no longer in -cache. This means that normal WRITE requests are not actually logged until the -next REQ_PREFLUSH request. This is to make it easier for userspace to replay -the log in a way that correlates to what is on disk and not what is in cache, -to make it easier to detect improper waiting/flushing. - -This works by attaching all WRITE requests to a list once the write completes. -Once we see a REQ_PREFLUSH request we splice this list onto the request and once -the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only -completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to -simulate the worst case scenario with regard to power failures. Consider the -following example (W means write, C means complete): - -W1,W2,W3,C3,C2,Wflush,C1,Cflush - -The log would show the following - -W3,W2,flush,W1.... - -Again this is to simulate what is actually on disk, this allows us to detect -cases where a power failure at a particular point in time would create an -inconsistent file system. - -Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as -they complete as those requests will obviously bypass the device cache. - -Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would -have all the DISCARD requests, and then the WRITE requests and then the FLUSH -request. Consider the following example: - -WRITE block 1, DISCARD block 1, FLUSH - -If we logged DISCARD when it completed, the replay would look like this - -DISCARD 1, WRITE 1, FLUSH - -which isn't quite what happened and wouldn't be caught during the log replay. - -Target interface -================ - -i) Constructor - - log-writes - - dev_path : Device that all of the IO will go to normally. - log_dev_path : Device where the log entries are written to. - -ii) Status - - <#logged entries> - - #logged entries : Number of logged entries - highest allocated sector : Highest allocated sector - -iii) Messages - - mark - - You can use a dmsetup message to set an arbitrary mark in a log. - For example say you want to fsck a file system after every - write, but first you need to replay up to the mkfs to make sure - we're fsck'ing something reasonable, you would do something like - this: - - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - - This would allow you to replay the log up to the mkfs mark and - then replay from that point on doing the fsck check in the - interval that you want. - - Every log has a mark at the end labeled "dm-log-writes-end". - -Userspace component -=================== - -There is a userspace tool that will replay the log for you in various ways. -It can be found here: https://github.com/josefbacik/log-writes - -Example usage -============= - -Say you want to test fsync on your file system. You would do something like -this: - -TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" -dmsetup create log --table "$TABLE" -mkfs.btrfs -f /dev/mapper/log -dmsetup message log 0 mark mkfs - -mount /dev/mapper/log /mnt/btrfs-test - -dmsetup message log 0 mark fsync -md5sum /mnt/btrfs-test/foo -umount /mnt/btrfs-test - -dmsetup remove log -replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync -mount /dev/sdb /mnt/btrfs-test -md5sum /mnt/btrfs-test/foo - - -Another option is to do a complicated file system operation and verify the file -system is consistent during the entire operation. You could do this with: - -TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" -dmsetup create log --table "$TABLE" -mkfs.btrfs -f /dev/mapper/log -dmsetup message log 0 mark mkfs - -mount /dev/mapper/log /mnt/btrfs-test - -btrfs filesystem balance /mnt/btrfs-test -umount /mnt/btrfs-test -dmsetup remove log - -replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs -btrfsck /dev/sdb -replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ - --fsck "btrfsck /dev/sdb" --check fua - -And that will replay the log until it sees a FUA request, run the fsck command -and if the fsck passes it will replay to the next FUA, until it is completed or -the fsck command exists abnormally. diff --git a/Documentation/device-mapper/persistent-data.rst b/Documentation/device-mapper/persistent-data.rst new file mode 100644 index 000000000000..2065c3c5a091 --- /dev/null +++ b/Documentation/device-mapper/persistent-data.rst @@ -0,0 +1,88 @@ +=============== +Persistent data +=============== + +Introduction +============ + +The more-sophisticated device-mapper targets require complex metadata +that is managed in kernel. In late 2010 we were seeing that various +different targets were rolling their own data structures, for example: + +- Mikulas Patocka's multisnap implementation +- Heinz Mauelshagen's thin provisioning target +- Another btree-based caching target posted to dm-devel +- Another multi-snapshot target based on a design of Daniel Phillips + +Maintaining these data structures takes a lot of work, so if possible +we'd like to reduce the number. + +The persistent-data library is an attempt to provide a re-usable +framework for people who want to store metadata in device-mapper +targets. It's currently used by the thin-provisioning target and an +upcoming hierarchical storage target. + +Overview +======== + +The main documentation is in the header files which can all be found +under drivers/md/persistent-data. + +The block manager +----------------- + +dm-block-manager.[hc] + +This provides access to the data on disk in fixed sized-blocks. There +is a read/write locking interface to prevent concurrent accesses, and +keep data that is being used in the cache. + +Clients of persistent-data are unlikely to use this directly. + +The transaction manager +----------------------- + +dm-transaction-manager.[hc] + +This restricts access to blocks and enforces copy-on-write semantics. +The only way you can get hold of a writable block through the +transaction manager is by shadowing an existing block (ie. doing +copy-on-write) or allocating a fresh one. Shadowing is elided within +the same transaction so performance is reasonable. The commit method +ensures that all data is flushed before it writes the superblock. +On power failure your metadata will be as it was when last committed. + +The Space Maps +-------------- + +dm-space-map.h +dm-space-map-metadata.[hc] +dm-space-map-disk.[hc] + +On-disk data structures that keep track of reference counts of blocks. +Also acts as the allocator of new blocks. Currently two +implementations: a simpler one for managing blocks on a different +device (eg. thinly-provisioned data blocks); and one for managing +the metadata space. The latter is complicated by the need to store +its own data within the space it's managing. + +The data structures +------------------- + +dm-btree.[hc] +dm-btree-remove.c +dm-btree-spine.c +dm-btree-internal.h + +Currently there is only one data structure, a hierarchical btree. +There are plans to add more. For example, something with an +array-like interface would see a lot of use. + +The btree is 'hierarchical' in that you can define it to be composed +of nested btrees, and take multiple keys. For example, the +thin-provisioning target uses a btree with two levels of nesting. +The first maps a device id to a mapping tree, and that in turn maps a +virtual block to a physical block. + +Values stored in the btrees can have arbitrary size. Keys are always +64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.txt deleted file mode 100644 index a333bcb3a6c2..000000000000 --- a/Documentation/device-mapper/persistent-data.txt +++ /dev/null @@ -1,84 +0,0 @@ -Introduction -============ - -The more-sophisticated device-mapper targets require complex metadata -that is managed in kernel. In late 2010 we were seeing that various -different targets were rolling their own data structures, for example: - -- Mikulas Patocka's multisnap implementation -- Heinz Mauelshagen's thin provisioning target -- Another btree-based caching target posted to dm-devel -- Another multi-snapshot target based on a design of Daniel Phillips - -Maintaining these data structures takes a lot of work, so if possible -we'd like to reduce the number. - -The persistent-data library is an attempt to provide a re-usable -framework for people who want to store metadata in device-mapper -targets. It's currently used by the thin-provisioning target and an -upcoming hierarchical storage target. - -Overview -======== - -The main documentation is in the header files which can all be found -under drivers/md/persistent-data. - -The block manager ------------------ - -dm-block-manager.[hc] - -This provides access to the data on disk in fixed sized-blocks. There -is a read/write locking interface to prevent concurrent accesses, and -keep data that is being used in the cache. - -Clients of persistent-data are unlikely to use this directly. - -The transaction manager ------------------------ - -dm-transaction-manager.[hc] - -This restricts access to blocks and enforces copy-on-write semantics. -The only way you can get hold of a writable block through the -transaction manager is by shadowing an existing block (ie. doing -copy-on-write) or allocating a fresh one. Shadowing is elided within -the same transaction so performance is reasonable. The commit method -ensures that all data is flushed before it writes the superblock. -On power failure your metadata will be as it was when last committed. - -The Space Maps --------------- - -dm-space-map.h -dm-space-map-metadata.[hc] -dm-space-map-disk.[hc] - -On-disk data structures that keep track of reference counts of blocks. -Also acts as the allocator of new blocks. Currently two -implementations: a simpler one for managing blocks on a different -device (eg. thinly-provisioned data blocks); and one for managing -the metadata space. The latter is complicated by the need to store -its own data within the space it's managing. - -The data structures -------------------- - -dm-btree.[hc] -dm-btree-remove.c -dm-btree-spine.c -dm-btree-internal.h - -Currently there is only one data structure, a hierarchical btree. -There are plans to add more. For example, something with an -array-like interface would see a lot of use. - -The btree is 'hierarchical' in that you can define it to be composed -of nested btrees, and take multiple keys. For example, the -thin-provisioning target uses a btree with two levels of nesting. -The first maps a device id to a mapping tree, and that in turn maps a -virtual block to a physical block. - -Values stored in the btrees can have arbitrary size. Keys are always -64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/device-mapper/snapshot.rst b/Documentation/device-mapper/snapshot.rst new file mode 100644 index 000000000000..4c53304e72f1 --- /dev/null +++ b/Documentation/device-mapper/snapshot.rst @@ -0,0 +1,180 @@ +============================== +Device-mapper snapshot support +============================== + +Device-mapper allows you, without massive data copying: + +- To create snapshots of any block device i.e. mountable, saved states of + the block device which are also writable without interfering with the + original content; +- To create device "forks", i.e. multiple different versions of the + same data stream. +- To merge a snapshot of a block device back into the snapshot's origin + device. + +In the first two cases, dm copies only the chunks of data that get +changed and uses a separate copy-on-write (COW) block device for +storage. + +For snapshot merge the contents of the COW storage are merged back into +the origin device. + + +There are three dm targets available: +snapshot, snapshot-origin, and snapshot-merge. + +- snapshot-origin + +which will normally have one or more snapshots based on it. +Reads will be mapped directly to the backing device. For each write, the +original data will be saved in the of each snapshot to keep +its visible content unchanged, at least until the fills up. + + +- snapshot + +A snapshot of the block device is created. Changed chunks of + sectors will be stored on the . Writes will +only go to the . Reads will come from the or +from for unchanged data. will often be +smaller than the origin and if it fills up the snapshot will become +useless and be disabled, returning errors. So it is important to monitor +the amount of free space and expand the before it fills up. + + is P (Persistent) or N (Not persistent - will not survive +after reboot). O (Overflow) can be added as a persistent store option +to allow userspace to advertise its support for seeing "Overflow" in the +snapshot status. So supported store types are "P", "PO" and "N". + +The difference between persistent and transient is with transient +snapshots less metadata must be saved on disk - they can be kept in +memory by the kernel. + +When loading or unloading the snapshot target, the corresponding +snapshot-origin or snapshot-merge target must be suspended. A failure to +suspend the origin target could result in data corruption. + + +* snapshot-merge + +takes the same table arguments as the snapshot target except it only +works with persistent snapshots. This target assumes the role of the +"snapshot-origin" target and must not be loaded if the "snapshot-origin" +is still present for . + +Creates a merging snapshot that takes control of the changed chunks +stored in the of an existing snapshot, through a handover +procedure, and merges these chunks back into the . Once merging +has started (in the background) the may be opened and the merge +will continue while I/O is flowing to it. Changes to the are +deferred until the merging snapshot's corresponding chunk(s) have been +merged. Once merging has started the snapshot device, associated with +the "snapshot" target, will return -EIO when accessed. + + +How snapshot is used by LVM2 +============================ +When you create the first LVM2 snapshot of a volume, four dm devices are used: + +1) a device containing the original mapping table of the source volume; +2) a device used as the ; +3) a "snapshot" device, combining #1 and #2, which is the visible snapshot + volume; +4) the "original" volume (which uses the device number used by the original + source volume), whose table is replaced by a "snapshot-origin" mapping + from device #1. + +A fixed naming scheme is used, so with the following commands:: + + lvcreate -L 1G -n base volumeGroup + lvcreate -L 100M --snapshot -n snap volumeGroup/base + +we'll have this situation (with volumes in above order):: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 + volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 + volumeGroup-base: 0 2097152 snapshot-origin 254:11 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow + brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap + brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + + +How snapshot-merge is used by LVM2 +================================== +A merging snapshot assumes the role of the "snapshot-origin" while +merging. As such the "snapshot-origin" is replaced with +"snapshot-merge". The "-real" device is not changed and the "-cow" +device is renamed to -cow to aid LVM2's cleanup of the +merging snapshot after it completes. The "snapshot" that hands over its +COW device to the "snapshot-merge" is deactivated (unless using lvchange +--refresh); but if it is left active it will simply return I/O errors. + +A snapshot will merge into its origin with the following command:: + + lvconvert --merge volumeGroup/snap + +we'll now have this situation:: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-base-cow: 0 204800 linear 8:19 2097536 + volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow + brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base + + +How to determine when a merging is complete +=========================================== +The snapshot-merge and snapshot status lines end with: + + / + +Both and include both data and metadata. +During merging, the number of sectors allocated gets smaller and +smaller. Merging has finished when the number of sectors holding data +is zero, in other words == . + +Here is a practical example (using a hybrid of lvm and dmsetup commands):: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g + snap volumeGroup swi-a- 1.00g base 18.97 + + # dmsetup status volumeGroup-snap + 0 8388608 snapshot 397896/2097152 1560 + ^^^^ metadata sectors + + # lvconvert --merge -b volumeGroup/snap + Merging of volume snap started. + + # lvs volumeGroup/snap + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup Owi-a- 4.00g 17.23 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 281688/2097152 1104 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 180480/2097152 712 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 16/2097152 16 + +Merging has finished. + +:: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt deleted file mode 100644 index b8bbb516f989..000000000000 --- a/Documentation/device-mapper/snapshot.txt +++ /dev/null @@ -1,176 +0,0 @@ -Device-mapper snapshot support -============================== - -Device-mapper allows you, without massive data copying: - -*) To create snapshots of any block device i.e. mountable, saved states of -the block device which are also writable without interfering with the -original content; -*) To create device "forks", i.e. multiple different versions of the -same data stream. -*) To merge a snapshot of a block device back into the snapshot's origin -device. - -In the first two cases, dm copies only the chunks of data that get -changed and uses a separate copy-on-write (COW) block device for -storage. - -For snapshot merge the contents of the COW storage are merged back into -the origin device. - - -There are three dm targets available: -snapshot, snapshot-origin, and snapshot-merge. - -*) snapshot-origin - -which will normally have one or more snapshots based on it. -Reads will be mapped directly to the backing device. For each write, the -original data will be saved in the of each snapshot to keep -its visible content unchanged, at least until the fills up. - - -*) snapshot - -A snapshot of the block device is created. Changed chunks of - sectors will be stored on the . Writes will -only go to the . Reads will come from the or -from for unchanged data. will often be -smaller than the origin and if it fills up the snapshot will become -useless and be disabled, returning errors. So it is important to monitor -the amount of free space and expand the before it fills up. - - is P (Persistent) or N (Not persistent - will not survive -after reboot). O (Overflow) can be added as a persistent store option -to allow userspace to advertise its support for seeing "Overflow" in the -snapshot status. So supported store types are "P", "PO" and "N". - -The difference between persistent and transient is with transient -snapshots less metadata must be saved on disk - they can be kept in -memory by the kernel. - -When loading or unloading the snapshot target, the corresponding -snapshot-origin or snapshot-merge target must be suspended. A failure to -suspend the origin target could result in data corruption. - - -* snapshot-merge - -takes the same table arguments as the snapshot target except it only -works with persistent snapshots. This target assumes the role of the -"snapshot-origin" target and must not be loaded if the "snapshot-origin" -is still present for . - -Creates a merging snapshot that takes control of the changed chunks -stored in the of an existing snapshot, through a handover -procedure, and merges these chunks back into the . Once merging -has started (in the background) the may be opened and the merge -will continue while I/O is flowing to it. Changes to the are -deferred until the merging snapshot's corresponding chunk(s) have been -merged. Once merging has started the snapshot device, associated with -the "snapshot" target, will return -EIO when accessed. - - -How snapshot is used by LVM2 -============================ -When you create the first LVM2 snapshot of a volume, four dm devices are used: - -1) a device containing the original mapping table of the source volume; -2) a device used as the ; -3) a "snapshot" device, combining #1 and #2, which is the visible snapshot - volume; -4) the "original" volume (which uses the device number used by the original - source volume), whose table is replaced by a "snapshot-origin" mapping - from device #1. - -A fixed naming scheme is used, so with the following commands: - -lvcreate -L 1G -n base volumeGroup -lvcreate -L 100M --snapshot -n snap volumeGroup/base - -we'll have this situation (with volumes in above order): - -# dmsetup table|grep volumeGroup - -volumeGroup-base-real: 0 2097152 linear 8:19 384 -volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 -volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 -volumeGroup-base: 0 2097152 snapshot-origin 254:11 - -# ls -lL /dev/mapper/volumeGroup-* -brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real -brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow -brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap -brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base - - -How snapshot-merge is used by LVM2 -================================== -A merging snapshot assumes the role of the "snapshot-origin" while -merging. As such the "snapshot-origin" is replaced with -"snapshot-merge". The "-real" device is not changed and the "-cow" -device is renamed to -cow to aid LVM2's cleanup of the -merging snapshot after it completes. The "snapshot" that hands over its -COW device to the "snapshot-merge" is deactivated (unless using lvchange ---refresh); but if it is left active it will simply return I/O errors. - -A snapshot will merge into its origin with the following command: - -lvconvert --merge volumeGroup/snap - -we'll now have this situation: - -# dmsetup table|grep volumeGroup - -volumeGroup-base-real: 0 2097152 linear 8:19 384 -volumeGroup-base-cow: 0 204800 linear 8:19 2097536 -volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 - -# ls -lL /dev/mapper/volumeGroup-* -brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real -brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow -brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base - - -How to determine when a merging is complete -=========================================== -The snapshot-merge and snapshot status lines end with: - / - -Both and include both data and metadata. -During merging, the number of sectors allocated gets smaller and -smaller. Merging has finished when the number of sectors holding data -is zero, in other words == . - -Here is a practical example (using a hybrid of lvm and dmsetup commands): - -# lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g - snap volumeGroup swi-a- 1.00g base 18.97 - -# dmsetup status volumeGroup-snap -0 8388608 snapshot 397896/2097152 1560 - ^^^^ metadata sectors - -# lvconvert --merge -b volumeGroup/snap - Merging of volume snap started. - -# lvs volumeGroup/snap - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup Owi-a- 4.00g 17.23 - -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 281688/2097152 1104 - -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 180480/2097152 712 - -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 16/2097152 16 - -Merging has finished. - -# lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g diff --git a/Documentation/device-mapper/statistics.rst b/Documentation/device-mapper/statistics.rst new file mode 100644 index 000000000000..3d80a9f850cc --- /dev/null +++ b/Documentation/device-mapper/statistics.rst @@ -0,0 +1,225 @@ +============= +DM statistics +============= + +Device Mapper supports the collection of I/O statistics on user-defined +regions of a DM device. If no regions are defined no statistics are +collected so there isn't any performance impact. Only bio-based DM +devices are currently supported. + +Each user-defined region specifies a starting sector, length and step. +Individual statistics will be collected for each step-sized area within +the range specified. + +The I/O statistics counters for each step-sized area of a region are +in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: +Documentation/iostats.txt). But two extra counters (12 and 13) are +provided: total time spent reading and writing. When the histogram +argument is used, the 14th parameter is reported that represents the +histogram of latencies. All these counters may be accessed by sending +the @stats_print message to the appropriate DM device via dmsetup. + +The reported times are in milliseconds and the granularity depends on +the kernel ticks. When the option precise_timestamps is used, the +reported times are in nanoseconds. + +Each region has a corresponding unique identifier, which we call a +region_id, that is assigned when the region is created. The region_id +must be supplied when querying statistics about the region, deleting the +region, etc. Unique region_ids enable multiple userspace programs to +request and process statistics for the same DM device without stepping +on each other's data. + +The creation of DM statistics will allocate memory via kmalloc or +fallback to using vmalloc space. At most, 1/4 of the overall system +memory may be allocated by DM statistics. The admin can see how much +memory is used by reading: + + /sys/module/dm_mod/parameters/stats_current_allocated_bytes + +Messages +======== + + @stats_create [ ...] [ []] + Create a new region and return the region_id. + + + "-" + whole device + "+" + a range of 512-byte sectors + starting with . + + + "" + the range is subdivided into areas each containing + sectors. + "/" + the range is subdivided into the specified + number of areas. + + + The number of optional arguments + + + The following optional arguments are supported: + + precise_timestamps + use precise timer with nanosecond resolution + instead of the "jiffies" variable. When this argument is + used, the resulting times are in nanoseconds instead of + milliseconds. Precise timestamps are a little bit slower + to obtain than jiffies-based timestamps. + histogram:n1,n2,n3,n4,... + collect histogram of latencies. The + numbers n1, n2, etc are times that represent the boundaries + of the histogram. If precise_timestamps is not used, the + times are in milliseconds, otherwise they are in + nanoseconds. For each range, the kernel will report the + number of requests that completed within this range. For + example, if we use "histogram:10,20,30", the kernel will + report four numbers a:b:c:d. a is the number of requests + that took 0-10 ms to complete, b is the number of requests + that took 10-20 ms to complete, c is the number of requests + that took 20-30 ms to complete and d is the number of + requests that took more than 30 ms to complete. + + + An optional parameter. A name that uniquely identifies + the userspace owner of the range. This groups ranges together + so that userspace programs can identify the ranges they + created and ignore those created by others. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use it for anything else. + If we omit the number of optional arguments, program id must not + be a number, otherwise it would be interpreted as the number of + optional arguments. + + + An optional parameter. A word that provides auxiliary data + that is useful to the client program that created the range. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use this value for anything. + + @stats_delete + Delete the region with the specified id. + + + region_id returned from @stats_create + + @stats_clear + Clear all the counters except the in-flight i/o counters. + + + region_id returned from @stats_create + + @stats_list [] + List all regions registered with @stats_create. + + + An optional parameter. + If this parameter is specified, only matching regions + are returned. + If it is not specified, all regions are returned. + + Output format: + : + + precise_timestamps histogram:n1,n2,n3,... + + The strings "precise_timestamps" and "histogram" are printed only + if they were specified when creating the region. + + @stats_print [ ] + Print counters for each step-sized area of a region. + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are returned. + + + The number of lines to include in the output. + If omitted, all lines are returned. + + Output format for each step-sized area of a region: + + + + counters + + The first 11 counters have the same meaning as + `/sys/block/*/stat or /proc/diskstats`. + + Please refer to Documentation/iostats.txt for details. + + 1. the number of reads completed + 2. the number of reads merged + 3. the number of sectors read + 4. the number of milliseconds spent reading + 5. the number of writes completed + 6. the number of writes merged + 7. the number of sectors written + 8. the number of milliseconds spent writing + 9. the number of I/Os currently in progress + 10. the number of milliseconds spent doing I/Os + 11. the weighted number of milliseconds spent doing I/Os + + Additional counters: + + 12. the total time spent reading in milliseconds + 13. the total time spent writing in milliseconds + + @stats_print_clear [ ] + Atomically print and then clear all the counters except the + in-flight i/o counters. Useful when the client consuming the + statistics does not want to lose any statistics (those updated + between printing and clearing). + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are printed and then cleared. + + + The number of lines to process. + If omitted, all lines are printed and then cleared. + + @stats_set_aux + Store auxiliary data aux_data for the specified region. + + + region_id returned from @stats_create + + + The string that identifies data which is useful to the client + program that created the range. The kernel returns this + string back in the output of @stats_list message, but it + doesn't use this value for anything. + +Examples +======== + +Subdivide the DM device 'vol' into 100 pieces and start collecting +statistics on them:: + + dmsetup message vol 0 @stats_create - /100 + +Set the auxiliary data string to "foo bar baz" (the escape for each +space must also be escaped, otherwise the shell will consume them):: + + dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz + +List the statistics:: + + dmsetup message vol 0 @stats_list + +Print the statistics:: + + dmsetup message vol 0 @stats_print 0 + +Delete the statistics:: + + dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt deleted file mode 100644 index 170ac02a1f50..000000000000 --- a/Documentation/device-mapper/statistics.txt +++ /dev/null @@ -1,223 +0,0 @@ -DM statistics -============= - -Device Mapper supports the collection of I/O statistics on user-defined -regions of a DM device. If no regions are defined no statistics are -collected so there isn't any performance impact. Only bio-based DM -devices are currently supported. - -Each user-defined region specifies a starting sector, length and step. -Individual statistics will be collected for each step-sized area within -the range specified. - -The I/O statistics counters for each step-sized area of a region are -in the same format as /sys/block/*/stat or /proc/diskstats (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are -provided: total time spent reading and writing. When the histogram -argument is used, the 14th parameter is reported that represents the -histogram of latencies. All these counters may be accessed by sending -the @stats_print message to the appropriate DM device via dmsetup. - -The reported times are in milliseconds and the granularity depends on -the kernel ticks. When the option precise_timestamps is used, the -reported times are in nanoseconds. - -Each region has a corresponding unique identifier, which we call a -region_id, that is assigned when the region is created. The region_id -must be supplied when querying statistics about the region, deleting the -region, etc. Unique region_ids enable multiple userspace programs to -request and process statistics for the same DM device without stepping -on each other's data. - -The creation of DM statistics will allocate memory via kmalloc or -fallback to using vmalloc space. At most, 1/4 of the overall system -memory may be allocated by DM statistics. The admin can see how much -memory is used by reading -/sys/module/dm_mod/parameters/stats_current_allocated_bytes - -Messages -======== - - @stats_create - [ ...] - [ []] - - Create a new region and return the region_id. - - - "-" - whole device - "+" - a range of 512-byte sectors - starting with . - - - "" - the range is subdivided into areas each containing - sectors. - "/" - the range is subdivided into the specified - number of areas. - - - The number of optional arguments - - - The following optional arguments are supported - precise_timestamps - use precise timer with nanosecond resolution - instead of the "jiffies" variable. When this argument is - used, the resulting times are in nanoseconds instead of - milliseconds. Precise timestamps are a little bit slower - to obtain than jiffies-based timestamps. - histogram:n1,n2,n3,n4,... - collect histogram of latencies. The - numbers n1, n2, etc are times that represent the boundaries - of the histogram. If precise_timestamps is not used, the - times are in milliseconds, otherwise they are in - nanoseconds. For each range, the kernel will report the - number of requests that completed within this range. For - example, if we use "histogram:10,20,30", the kernel will - report four numbers a:b:c:d. a is the number of requests - that took 0-10 ms to complete, b is the number of requests - that took 10-20 ms to complete, c is the number of requests - that took 20-30 ms to complete and d is the number of - requests that took more than 30 ms to complete. - - - An optional parameter. A name that uniquely identifies - the userspace owner of the range. This groups ranges together - so that userspace programs can identify the ranges they - created and ignore those created by others. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use it for anything else. - If we omit the number of optional arguments, program id must not - be a number, otherwise it would be interpreted as the number of - optional arguments. - - - An optional parameter. A word that provides auxiliary data - that is useful to the client program that created the range. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use this value for anything. - - @stats_delete - - Delete the region with the specified id. - - - region_id returned from @stats_create - - @stats_clear - - Clear all the counters except the in-flight i/o counters. - - - region_id returned from @stats_create - - @stats_list [] - - List all regions registered with @stats_create. - - - An optional parameter. - If this parameter is specified, only matching regions - are returned. - If it is not specified, all regions are returned. - - Output format: - : + - precise_timestamps histogram:n1,n2,n3,... - - The strings "precise_timestamps" and "histogram" are printed only - if they were specified when creating the region. - - @stats_print [ ] - - Print counters for each step-sized area of a region. - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are returned. - - - The number of lines to include in the output. - If omitted, all lines are returned. - - Output format for each step-sized area of a region: - - + counters - - The first 11 counters have the same meaning as - /sys/block/*/stat or /proc/diskstats. - - Please refer to Documentation/iostats.txt for details. - - 1. the number of reads completed - 2. the number of reads merged - 3. the number of sectors read - 4. the number of milliseconds spent reading - 5. the number of writes completed - 6. the number of writes merged - 7. the number of sectors written - 8. the number of milliseconds spent writing - 9. the number of I/Os currently in progress - 10. the number of milliseconds spent doing I/Os - 11. the weighted number of milliseconds spent doing I/Os - - Additional counters: - 12. the total time spent reading in milliseconds - 13. the total time spent writing in milliseconds - - @stats_print_clear [ ] - - Atomically print and then clear all the counters except the - in-flight i/o counters. Useful when the client consuming the - statistics does not want to lose any statistics (those updated - between printing and clearing). - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are printed and then cleared. - - - The number of lines to process. - If omitted, all lines are printed and then cleared. - - @stats_set_aux - - Store auxiliary data aux_data for the specified region. - - - region_id returned from @stats_create - - - The string that identifies data which is useful to the client - program that created the range. The kernel returns this - string back in the output of @stats_list message, but it - doesn't use this value for anything. - -Examples -======== - -Subdivide the DM device 'vol' into 100 pieces and start collecting -statistics on them: - - dmsetup message vol 0 @stats_create - /100 - -Set the auxiliary data string to "foo bar baz" (the escape for each -space must also be escaped, otherwise the shell will consume them): - - dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz - -List the statistics: - - dmsetup message vol 0 @stats_list - -Print the statistics: - - dmsetup message vol 0 @stats_print 0 - -Delete the statistics: - - dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/device-mapper/striped.rst b/Documentation/device-mapper/striped.rst new file mode 100644 index 000000000000..e9a8da192ae1 --- /dev/null +++ b/Documentation/device-mapper/striped.rst @@ -0,0 +1,61 @@ +========= +dm-stripe +========= + +Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) +device across one or more underlying devices. Data is written in "chunks", +with consecutive chunks rotating among the underlying devices. This can +potentially provide improved I/O throughput by utilizing several physical +devices in parallel. + +Parameters: [ ]+ + : + Number of underlying devices. + : + Size of each chunk of data. Must be at least as + large as the system's PAGE_SIZE. + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + +One or more underlying devices can be specified. The striped device size must +be a multiple of the chunk size multiplied by the number of underlying devices. + + +Example scripts +=============== + +:: + + #!/usr/bin/perl -w + # Create a striped device across any number of underlying devices. The device + # will be called "stripe_dev" and have a chunk-size of 128k. + + my $chunk_size = 128 * 2; + my $dev_name = "stripe_dev"; + my $num_devs = @ARGV; + my @devs = @ARGV; + my ($min_dev_size, $stripe_dev_size, $i); + + if (!$num_devs) { + die("Specify at least one device\n"); + } + + $min_dev_size = `blockdev --getsz $devs[0]`; + for ($i = 1; $i < $num_devs; $i++) { + my $this_size = `blockdev --getsz $devs[$i]`; + $min_dev_size = ($min_dev_size < $this_size) ? + $min_dev_size : $this_size; + } + + $stripe_dev_size = $min_dev_size * $num_devs; + $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); + + $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; + for ($i = 0; $i < $num_devs; $i++) { + $table .= " $devs[$i] 0"; + } + + `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt deleted file mode 100644 index 07ec492cceee..000000000000 --- a/Documentation/device-mapper/striped.txt +++ /dev/null @@ -1,57 +0,0 @@ -dm-stripe -========= - -Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) -device across one or more underlying devices. Data is written in "chunks", -with consecutive chunks rotating among the underlying devices. This can -potentially provide improved I/O throughput by utilizing several physical -devices in parallel. - -Parameters: [ ]+ - : Number of underlying devices. - : Size of each chunk of data. Must be at least as - large as the system's PAGE_SIZE. - : Full pathname to the underlying block-device, or a - "major:minor" device-number. - : Starting sector within the device. - -One or more underlying devices can be specified. The striped device size must -be a multiple of the chunk size multiplied by the number of underlying devices. - - -Example scripts -=============== - -[[ -#!/usr/bin/perl -w -# Create a striped device across any number of underlying devices. The device -# will be called "stripe_dev" and have a chunk-size of 128k. - -my $chunk_size = 128 * 2; -my $dev_name = "stripe_dev"; -my $num_devs = @ARGV; -my @devs = @ARGV; -my ($min_dev_size, $stripe_dev_size, $i); - -if (!$num_devs) { - die("Specify at least one device\n"); -} - -$min_dev_size = `blockdev --getsz $devs[0]`; -for ($i = 1; $i < $num_devs; $i++) { - my $this_size = `blockdev --getsz $devs[$i]`; - $min_dev_size = ($min_dev_size < $this_size) ? - $min_dev_size : $this_size; -} - -$stripe_dev_size = $min_dev_size * $num_devs; -$stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); - -$table = "0 $stripe_dev_size striped $num_devs $chunk_size"; -for ($i = 0; $i < $num_devs; $i++) { - $table .= " $devs[$i] 0"; -} - -`echo $table | dmsetup create $dev_name`; -]] - diff --git a/Documentation/device-mapper/switch.rst b/Documentation/device-mapper/switch.rst new file mode 100644 index 000000000000..7dde06be1a4f --- /dev/null +++ b/Documentation/device-mapper/switch.rst @@ -0,0 +1,141 @@ +========= +dm-switch +========= + +The device-mapper switch target creates a device that supports an +arbitrary mapping of fixed-size regions of I/O across a fixed set of +paths. The path used for any specific region can be switched +dynamically by sending the target a message. + +It maps I/O to underlying block devices efficiently when there is a large +number of fixed-sized address regions but there is no simple pattern +that would allow for a compact representation of the mapping such as +dm-stripe. + +Background +---------- + +Dell EqualLogic and some other iSCSI storage arrays use a distributed +frameless architecture. In this architecture, the storage group +consists of a number of distinct storage arrays ("members") each having +independent controllers, disk storage and network adapters. When a LUN +is created it is spread across multiple members. The details of the +spreading are hidden from initiators connected to this storage system. +The storage group exposes a single target discovery portal, no matter +how many members are being used. When iSCSI sessions are created, each +session is connected to an eth port on a single member. Data to a LUN +can be sent on any iSCSI session, and if the blocks being accessed are +stored on another member the I/O will be forwarded as required. This +forwarding is invisible to the initiator. The storage layout is also +dynamic, and the blocks stored on disk may be moved from member to +member as needed to balance the load. + +This architecture simplifies the management and configuration of both +the storage group and initiators. In a multipathing configuration, it +is possible to set up multiple iSCSI sessions to use multiple network +interfaces on both the host and target to take advantage of the +increased network bandwidth. An initiator could use a simple round +robin algorithm to send I/O across all paths and let the storage array +members forward it as necessary, but there is a performance advantage to +sending data directly to the correct member. + +A device-mapper table already lets you map different regions of a +device onto different targets. However in this architecture the LUN is +spread with an address region size on the order of 10s of MBs, which +means the resulting table could have more than a million entries and +consume far too much memory. + +Using this device-mapper switch target we can now build a two-layer +device hierarchy: + + Upper Tier - Determine which array member the I/O should be sent to. + Lower Tier - Load balance amongst paths to a particular member. + +The lower tier consists of a single dm multipath device for each member. +Each of these multipath devices contains the set of paths directly to +the array member in one priority group, and leverages existing path +selectors to load balance amongst these paths. We also build a +non-preferred priority group containing paths to other array members for +failover reasons. + +The upper tier consists of a single dm-switch device. This device uses +a bitmap to look up the location of the I/O and choose the appropriate +lower tier device to route the I/O. By using a bitmap we are able to +use 4 bits for each address range in a 16 member group (which is very +large for us). This is a much denser representation than the dm table +b-tree can achieve. + +Construction Parameters +======================= + + [...] [ ]+ + + The number of paths across which to distribute the I/O. + + + The number of 512-byte sectors in a region. Each region can be redirected + to any of the available paths. + + + The number of optional arguments. Currently, no optional arguments + are supported and so this must be zero. + + + The block device that represents a specific path to the device. + + + The offset of the start of data on the specific (in units + of 512-byte sectors). This number is added to the sector number when + forwarding the request to the specific path. Typically it is zero. + +Messages +======== + +set_region_mappings : []: []:... + +Modify the region table by specifying which regions are redirected to +which paths. + + + The region number (region size was specified in constructor parameters). + If index is omitted, the next region (previous index + 1) is used. + Expressed in hexadecimal (WITHOUT any prefix like 0x). + + + The path number in the range 0 ... ( - 1). + Expressed in hexadecimal (WITHOUT any prefix like 0x). + +R, + This parameter allows repetitive patterns to be loaded quickly. and + are hexadecimal numbers. The last mappings are repeated in the next + slots. + +Status +====== + +No status line is reported. + +Example +======= + +Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with +the same size. + +Create a switch device with 64kB region size:: + + dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` + switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" + +Set mappings for the first 7 entries to point to devices switch0, switch1, +switch2, switch0, switch1, switch2, switch1:: + + dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 + +Set repetitive mapping. This command:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 + +is equivalent to:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ + :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt deleted file mode 100644 index 5bd4831db4a8..000000000000 --- a/Documentation/device-mapper/switch.txt +++ /dev/null @@ -1,138 +0,0 @@ -dm-switch -========= - -The device-mapper switch target creates a device that supports an -arbitrary mapping of fixed-size regions of I/O across a fixed set of -paths. The path used for any specific region can be switched -dynamically by sending the target a message. - -It maps I/O to underlying block devices efficiently when there is a large -number of fixed-sized address regions but there is no simple pattern -that would allow for a compact representation of the mapping such as -dm-stripe. - -Background ----------- - -Dell EqualLogic and some other iSCSI storage arrays use a distributed -frameless architecture. In this architecture, the storage group -consists of a number of distinct storage arrays ("members") each having -independent controllers, disk storage and network adapters. When a LUN -is created it is spread across multiple members. The details of the -spreading are hidden from initiators connected to this storage system. -The storage group exposes a single target discovery portal, no matter -how many members are being used. When iSCSI sessions are created, each -session is connected to an eth port on a single member. Data to a LUN -can be sent on any iSCSI session, and if the blocks being accessed are -stored on another member the I/O will be forwarded as required. This -forwarding is invisible to the initiator. The storage layout is also -dynamic, and the blocks stored on disk may be moved from member to -member as needed to balance the load. - -This architecture simplifies the management and configuration of both -the storage group and initiators. In a multipathing configuration, it -is possible to set up multiple iSCSI sessions to use multiple network -interfaces on both the host and target to take advantage of the -increased network bandwidth. An initiator could use a simple round -robin algorithm to send I/O across all paths and let the storage array -members forward it as necessary, but there is a performance advantage to -sending data directly to the correct member. - -A device-mapper table already lets you map different regions of a -device onto different targets. However in this architecture the LUN is -spread with an address region size on the order of 10s of MBs, which -means the resulting table could have more than a million entries and -consume far too much memory. - -Using this device-mapper switch target we can now build a two-layer -device hierarchy: - - Upper Tier - Determine which array member the I/O should be sent to. - Lower Tier - Load balance amongst paths to a particular member. - -The lower tier consists of a single dm multipath device for each member. -Each of these multipath devices contains the set of paths directly to -the array member in one priority group, and leverages existing path -selectors to load balance amongst these paths. We also build a -non-preferred priority group containing paths to other array members for -failover reasons. - -The upper tier consists of a single dm-switch device. This device uses -a bitmap to look up the location of the I/O and choose the appropriate -lower tier device to route the I/O. By using a bitmap we are able to -use 4 bits for each address range in a 16 member group (which is very -large for us). This is a much denser representation than the dm table -b-tree can achieve. - -Construction Parameters -======================= - - [...] - [ ]+ - - - The number of paths across which to distribute the I/O. - - - The number of 512-byte sectors in a region. Each region can be redirected - to any of the available paths. - - - The number of optional arguments. Currently, no optional arguments - are supported and so this must be zero. - - - The block device that represents a specific path to the device. - - - The offset of the start of data on the specific (in units - of 512-byte sectors). This number is added to the sector number when - forwarding the request to the specific path. Typically it is zero. - -Messages -======== - -set_region_mappings : []: []:... - -Modify the region table by specifying which regions are redirected to -which paths. - - - The region number (region size was specified in constructor parameters). - If index is omitted, the next region (previous index + 1) is used. - Expressed in hexadecimal (WITHOUT any prefix like 0x). - - - The path number in the range 0 ... ( - 1). - Expressed in hexadecimal (WITHOUT any prefix like 0x). - -R, - This parameter allows repetitive patterns to be loaded quickly. and - are hexadecimal numbers. The last mappings are repeated in the next - slots. - -Status -====== - -No status line is reported. - -Example -======= - -Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with -the same size. - -Create a switch device with 64kB region size: - dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` - switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" - -Set mappings for the first 7 entries to point to devices switch0, switch1, -switch2, switch0, switch1, switch2, switch1: - dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 - -Set repetitive mapping. This command: - dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 -is equivalent to: - dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ - :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 - diff --git a/Documentation/device-mapper/thin-provisioning.rst b/Documentation/device-mapper/thin-provisioning.rst new file mode 100644 index 000000000000..bafebf79da4b --- /dev/null +++ b/Documentation/device-mapper/thin-provisioning.rst @@ -0,0 +1,427 @@ +================= +Thin provisioning +================= + +Introduction +============ + +This document describes a collection of device-mapper targets that +between them implement thin-provisioning and snapshots. + +The main highlight of this implementation, compared to the previous +implementation of snapshots, is that it allows many virtual devices to +be stored on the same data volume. This simplifies administration and +allows the sharing of data between volumes, thus reducing disk usage. + +Another significant feature is support for an arbitrary depth of +recursive snapshots (snapshots of snapshots of snapshots ...). The +previous implementation of snapshots did this by chaining together +lookup tables, and so performance was O(depth). This new +implementation uses a single data structure to avoid this degradation +with depth. Fragmentation may still be an issue, however, in some +scenarios. + +Metadata is stored on a separate device from data, giving the +administrator some freedom, for example to: + +- Improve metadata resilience by storing metadata on a mirrored volume + but data on a non-mirrored one. + +- Improve performance by storing the metadata on SSD. + +Status +====== + +These targets are considered safe for production use. But different use +cases will have different performance characteristics, for example due +to fragmentation of the data volume. + +If you find this software is not performing as expected please mail +dm-devel@redhat.com with details and we'll try our best to improve +things for you. + +Userspace tools for checking and repairing the metadata have been fully +developed and are available as 'thin_check' and 'thin_repair'. The name +of the package that provides these utilities varies by distribution (on +a Red Hat distribution it is named 'device-mapper-persistent-data'). + +Cookbook +======== + +This section describes some quick recipes for using thin provisioning. +They use the dmsetup program to control the device-mapper driver +directly. End users will be advised to use a higher-level volume +manager such as LVM2 once support has been added. + +Pool device +----------- + +The pool device ties together the metadata volume and the data volume. +It maps I/O linearly to the data volume and updates the metadata via +two mechanisms: + +- Function calls from the thin targets + +- Device-mapper 'messages' from userspace which control the creation of new + virtual devices amongst other things. + +Setting up a fresh pool device +------------------------------ + +Setting up a pool device requires a valid metadata device, and a +data device. If you do not have an existing metadata device you can +make one by zeroing the first 4k to indicate empty metadata. + + dd if=/dev/zero of=$metadata_dev bs=4096 count=1 + +The amount of metadata you need will vary according to how many blocks +are shared between thin devices (i.e. through snapshots). If you have +less sharing than average you'll need a larger-than-average metadata device. + +As a guide, we suggest you calculate the number of bytes to use in the +metadata device as 48 * $data_dev_size / $data_block_size but round it up +to 2MB if the answer is smaller. If you're creating large numbers of +snapshots which are recording large amounts of change, you may find you +need to increase this. + +The largest size supported is 16GB: If the device is larger, +a warning will be issued and the excess space will not be used. + +Reloading a pool table +---------------------- + +You may reload a pool's table, indeed this is how the pool is resized +if it runs out of space. (N.B. While specifying a different metadata +device when reloading is not forbidden at the moment, things will go +wrong if it does not route I/O to exactly the same on-disk location as +previously.) + +Using an existing pool device +----------------------------- + +:: + + dmsetup create pool \ + --table "0 20971520 thin-pool $metadata_dev $data_dev \ + $data_block_size $low_water_mark" + +$data_block_size gives the smallest unit of disk space that can be +allocated at a time expressed in units of 512-byte sectors. +$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a +multiple of 128 (64KB). $data_block_size cannot be changed after the +thin-pool is created. People primarily interested in thin provisioning +may want to use a value such as 1024 (512KB). People doing lots of +snapshotting may want a smaller value such as 128 (64KB). If you are +not zeroing newly-allocated data, a larger $data_block_size in the +region of 256000 (128MB) is suggested. + +$low_water_mark is expressed in blocks of size $data_block_size. If +free space on the data device drops below this level then a dm event +will be triggered which a userspace daemon should catch allowing it to +extend the pool device. Only one such event will be sent. + +No special event is triggered if a just resumed device's free space is below +the low water mark. However, resuming a device always triggers an +event; a userspace daemon should verify that free space exceeds the low +water mark when handling this event. + +A low water mark for the metadata device is maintained in the kernel and +will trigger a dm event if free space on the metadata device drops below +it. + +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a FLUSH or FUA bio is written. +If no such requests are made then commits will occur every second. This +means the thin-provisioning target behaves like a physical disk that has +a volatile write cache. If power is lost you may lose some recent +writes. The metadata should always be consistent in spite of any crash. + +If data space is exhausted the pool will either error or queue IO +according to the configuration (see: error_if_no_space). If metadata +space is exhausted or a metadata operation fails: the pool will error IO +until the pool is taken offline and repair is performed to 1) fix any +potential inconsistencies and 2) clear the flag that imposes repair. +Once the pool's metadata device is repaired it may be resized, which +will allow the pool to return to normal operation. Note that if a pool +is flagged as needing repair, the pool's data and metadata devices +cannot be resized until repair is performed. It should also be noted +that when the pool's metadata space is exhausted the current metadata +transaction is aborted. Given that the pool will cache IO whose +completion may have already been acknowledged to upper IO layers +(e.g. filesystem) it is strongly suggested that consistency checks +(e.g. fsck) be performed on those layers when repair of the pool is +required. + +Thin provisioning +----------------- + +i) Creating a new thinly-provisioned volume. + + To create a new thinly- provisioned volume you must send a message to an + active pool device, /dev/mapper/pool in this example:: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + + Here '0' is an identifier for the volume, a 24-bit number. It's up + to the caller to allocate and manage these identifiers. If the + identifier is already in use, the message will fail with -EEXIST. + +ii) Using a thinly-provisioned volume. + + Thinly-provisioned volumes are activated using the 'thin' target:: + + dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" + + The last parameter is the identifier for the thinp device. + +Internal snapshots +------------------ + +i) Creating an internal snapshot. + + Snapshots are created with another message to the pool. + + N.B. If the origin device that you wish to snapshot is active, you + must suspend it before creating the snapshot to avoid corruption. + This is NOT enforced at the moment, so please be careful! + + :: + + dmsetup suspend /dev/mapper/thin + dmsetup message /dev/mapper/pool 0 "create_snap 1 0" + dmsetup resume /dev/mapper/thin + + Here '1' is the identifier for the volume, a 24-bit number. '0' is the + identifier for the origin device. + +ii) Using an internal snapshot. + + Once created, the user doesn't have to worry about any connection + between the origin and the snapshot. Indeed the snapshot is no + different from any other thinly-provisioned device and can be + snapshotted itself via the same method. It's perfectly legal to + have only one of them active, and there's no ordering requirement on + activating or removing them both. (This differs from conventional + device-mapper snapshots.) + + Activate it exactly the same way as any other thinly-provisioned volume:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" + +External snapshots +------------------ + +You can use an external **read only** device as an origin for a +thinly-provisioned volume. Any read to an unprovisioned area of the +thin device will be passed through to the origin. Writes trigger +the allocation of new blocks as usual. + +One use case for this is VM hosts that want to run guests on +thinly-provisioned volumes but have the base image on another device +(possibly shared between many VMs). + +You must not write to the origin device if you use this technique! +Of course, you may write to the thin device and take internal snapshots +of the thin volume. + +i) Creating a snapshot of an external device + + This is the same as creating a thin device. + You don't mention the origin at this stage. + + :: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + +ii) Using a snapshot of an external device. + + Append an extra parameter to the thin target specifying the origin:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" + + N.B. All descendants (internal snapshots) of this snapshot require the + same extra origin parameter. + +Deactivation +------------ + +All devices using a pool must be deactivated before the pool itself +can be. + +:: + + dmsetup remove thin + dmsetup remove snap + dmsetup remove pool + +Reference +========= + +'thin-pool' target +------------------ + +i) Constructor + + :: + + thin-pool \ + [ []*] + + Optional feature arguments: + + skip_block_zeroing: + Skip the zeroing of newly-provisioned blocks. + + ignore_discard: + Disable discard support. + + no_discard_passdown: + Don't pass discards down to the underlying + data device, but just remove the mapping. + + read_only: + Don't allow any changes to be made to the pool + metadata. This mode is only available after the + thin-pool has been created and first used in full + read/write mode. It cannot be specified on initial + thin-pool creation. + + error_if_no_space: + Error IOs, instead of queueing, if no space. + + Data block size must be between 64KB (128 sectors) and 1GB + (2097152 sectors) inclusive. + + +ii) Status + + :: + + / + / + ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space + needs_check|- metadata_low_watermark + + transaction id: + A 64-bit number used by userspace to help synchronise with metadata + from volume managers. + + used data blocks / total data blocks + If the number of free blocks drops below the pool's low water mark a + dm event will be sent to userspace. This event is edge-triggered and + it will occur only once after each resume so volume manager writers + should register for the event and then check the target's status. + + held metadata root: + The location, in blocks, of the metadata root that has been + 'held' for userspace read access. '-' indicates there is no + held root. + + discard_passdown|no_discard_passdown + Whether or not discards are actually being passed down to the + underlying device. When this is enabled when loading the table, + it can get disabled if the underlying device doesn't support it. + + ro|rw|out_of_data_space + If the pool encounters certain types of device failures it will + drop into a read-only metadata mode in which no changes to + the pool metadata (like allocating new blocks) are permitted. + + In serious cases where even a read-only mode is deemed unsafe + no further I/O will be permitted and the status will just + contain the string 'Fail'. The userspace recovery tools + should then be used. + + error_if_no_space|queue_if_no_space + If the pool runs out of data or metadata space, the pool will + either queue or error the IO destined to the data device. The + default is to queue the IO until more space is added or the + 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool + module parameter can be used to change this timeout -- it + defaults to 60 seconds but may be disabled using a value of 0. + + needs_check + A metadata operation has failed, resulting in the needs_check + flag being set in the metadata's superblock. The metadata + device must be deactivated and checked/repaired before the + thin-pool can be made fully operational again. '-' indicates + needs_check is not set. + + metadata_low_watermark: + Value of metadata low watermark in blocks. The kernel sets this + value internally but userspace needs to know this value to + determine if an event was caused by crossing this threshold. + +iii) Messages + + create_thin + Create a new thinly-provisioned device. + is an arbitrary unique 24-bit identifier chosen by + the caller. + + create_snap + Create a new snapshot of another thinly-provisioned device. + is an arbitrary unique 24-bit identifier chosen by + the caller. + is the identifier of the thinly-provisioned device + of which the new device will be a snapshot. + + delete + Deletes a thin device. Irreversible. + + set_transaction_id + Userland volume managers, such as LVM, need a way to + synchronise their external metadata with the internal metadata of the + pool target. The thin-pool target offers to store an + arbitrary 64-bit transaction id and return it on the target's + status line. To avoid races you must provide what you think + the current transaction id is when you change it with this + compare-and-swap message. + + reserve_metadata_snap + Reserve a copy of the data mapping btree for use by userland. + This allows userland to inspect the mappings as they were when + this message was executed. Use the pool's status command to + get the root block associated with the metadata snapshot. + + release_metadata_snap + Release a previously reserved copy of the data mapping btree. + +'thin' target +------------- + +i) Constructor + + :: + + thin [] + + pool dev: + the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 + + dev id: + the internal device identifier of the device to be + activated. + + external origin dev: + an optional block device outside the pool to be treated as a + read-only snapshot origin: reads to unprovisioned areas of the + thin target will be mapped to this device. + +The pool doesn't store any size against the thin devices. If you +load a thin target that is smaller than you've been using previously, +then you'll have no access to blocks mapped beyond the end. If you +load a target that is bigger than before, then extra blocks will be +provisioned as and when needed. + +ii) Status + + + If the pool has encountered device errors and failed, the status + will just contain the string 'Fail'. The userspace recovery + tools should then be used. + + In the case where is 0, there is no highest + mapped sector and the value of is unspecified. diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt deleted file mode 100644 index 883e7ca5f745..000000000000 --- a/Documentation/device-mapper/thin-provisioning.txt +++ /dev/null @@ -1,411 +0,0 @@ -Introduction -============ - -This document describes a collection of device-mapper targets that -between them implement thin-provisioning and snapshots. - -The main highlight of this implementation, compared to the previous -implementation of snapshots, is that it allows many virtual devices to -be stored on the same data volume. This simplifies administration and -allows the sharing of data between volumes, thus reducing disk usage. - -Another significant feature is support for an arbitrary depth of -recursive snapshots (snapshots of snapshots of snapshots ...). The -previous implementation of snapshots did this by chaining together -lookup tables, and so performance was O(depth). This new -implementation uses a single data structure to avoid this degradation -with depth. Fragmentation may still be an issue, however, in some -scenarios. - -Metadata is stored on a separate device from data, giving the -administrator some freedom, for example to: - -- Improve metadata resilience by storing metadata on a mirrored volume - but data on a non-mirrored one. - -- Improve performance by storing the metadata on SSD. - -Status -====== - -These targets are considered safe for production use. But different use -cases will have different performance characteristics, for example due -to fragmentation of the data volume. - -If you find this software is not performing as expected please mail -dm-devel@redhat.com with details and we'll try our best to improve -things for you. - -Userspace tools for checking and repairing the metadata have been fully -developed and are available as 'thin_check' and 'thin_repair'. The name -of the package that provides these utilities varies by distribution (on -a Red Hat distribution it is named 'device-mapper-persistent-data'). - -Cookbook -======== - -This section describes some quick recipes for using thin provisioning. -They use the dmsetup program to control the device-mapper driver -directly. End users will be advised to use a higher-level volume -manager such as LVM2 once support has been added. - -Pool device ------------ - -The pool device ties together the metadata volume and the data volume. -It maps I/O linearly to the data volume and updates the metadata via -two mechanisms: - -- Function calls from the thin targets - -- Device-mapper 'messages' from userspace which control the creation of new - virtual devices amongst other things. - -Setting up a fresh pool device ------------------------------- - -Setting up a pool device requires a valid metadata device, and a -data device. If you do not have an existing metadata device you can -make one by zeroing the first 4k to indicate empty metadata. - - dd if=/dev/zero of=$metadata_dev bs=4096 count=1 - -The amount of metadata you need will vary according to how many blocks -are shared between thin devices (i.e. through snapshots). If you have -less sharing than average you'll need a larger-than-average metadata device. - -As a guide, we suggest you calculate the number of bytes to use in the -metadata device as 48 * $data_dev_size / $data_block_size but round it up -to 2MB if the answer is smaller. If you're creating large numbers of -snapshots which are recording large amounts of change, you may find you -need to increase this. - -The largest size supported is 16GB: If the device is larger, -a warning will be issued and the excess space will not be used. - -Reloading a pool table ----------------------- - -You may reload a pool's table, indeed this is how the pool is resized -if it runs out of space. (N.B. While specifying a different metadata -device when reloading is not forbidden at the moment, things will go -wrong if it does not route I/O to exactly the same on-disk location as -previously.) - -Using an existing pool device ------------------------------ - - dmsetup create pool \ - --table "0 20971520 thin-pool $metadata_dev $data_dev \ - $data_block_size $low_water_mark" - -$data_block_size gives the smallest unit of disk space that can be -allocated at a time expressed in units of 512-byte sectors. -$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a -multiple of 128 (64KB). $data_block_size cannot be changed after the -thin-pool is created. People primarily interested in thin provisioning -may want to use a value such as 1024 (512KB). People doing lots of -snapshotting may want a smaller value such as 128 (64KB). If you are -not zeroing newly-allocated data, a larger $data_block_size in the -region of 256000 (128MB) is suggested. - -$low_water_mark is expressed in blocks of size $data_block_size. If -free space on the data device drops below this level then a dm event -will be triggered which a userspace daemon should catch allowing it to -extend the pool device. Only one such event will be sent. - -No special event is triggered if a just resumed device's free space is below -the low water mark. However, resuming a device always triggers an -event; a userspace daemon should verify that free space exceeds the low -water mark when handling this event. - -A low water mark for the metadata device is maintained in the kernel and -will trigger a dm event if free space on the metadata device drops below -it. - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the thin-provisioning target behaves like a physical disk that has -a volatile write cache. If power is lost you may lose some recent -writes. The metadata should always be consistent in spite of any crash. - -If data space is exhausted the pool will either error or queue IO -according to the configuration (see: error_if_no_space). If metadata -space is exhausted or a metadata operation fails: the pool will error IO -until the pool is taken offline and repair is performed to 1) fix any -potential inconsistencies and 2) clear the flag that imposes repair. -Once the pool's metadata device is repaired it may be resized, which -will allow the pool to return to normal operation. Note that if a pool -is flagged as needing repair, the pool's data and metadata devices -cannot be resized until repair is performed. It should also be noted -that when the pool's metadata space is exhausted the current metadata -transaction is aborted. Given that the pool will cache IO whose -completion may have already been acknowledged to upper IO layers -(e.g. filesystem) it is strongly suggested that consistency checks -(e.g. fsck) be performed on those layers when repair of the pool is -required. - -Thin provisioning ------------------ - -i) Creating a new thinly-provisioned volume. - - To create a new thinly- provisioned volume you must send a message to an - active pool device, /dev/mapper/pool in this example. - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - - Here '0' is an identifier for the volume, a 24-bit number. It's up - to the caller to allocate and manage these identifiers. If the - identifier is already in use, the message will fail with -EEXIST. - -ii) Using a thinly-provisioned volume. - - Thinly-provisioned volumes are activated using the 'thin' target: - - dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" - - The last parameter is the identifier for the thinp device. - -Internal snapshots ------------------- - -i) Creating an internal snapshot. - - Snapshots are created with another message to the pool. - - N.B. If the origin device that you wish to snapshot is active, you - must suspend it before creating the snapshot to avoid corruption. - This is NOT enforced at the moment, so please be careful! - - dmsetup suspend /dev/mapper/thin - dmsetup message /dev/mapper/pool 0 "create_snap 1 0" - dmsetup resume /dev/mapper/thin - - Here '1' is the identifier for the volume, a 24-bit number. '0' is the - identifier for the origin device. - -ii) Using an internal snapshot. - - Once created, the user doesn't have to worry about any connection - between the origin and the snapshot. Indeed the snapshot is no - different from any other thinly-provisioned device and can be - snapshotted itself via the same method. It's perfectly legal to - have only one of them active, and there's no ordering requirement on - activating or removing them both. (This differs from conventional - device-mapper snapshots.) - - Activate it exactly the same way as any other thinly-provisioned volume: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" - -External snapshots ------------------- - -You can use an external _read only_ device as an origin for a -thinly-provisioned volume. Any read to an unprovisioned area of the -thin device will be passed through to the origin. Writes trigger -the allocation of new blocks as usual. - -One use case for this is VM hosts that want to run guests on -thinly-provisioned volumes but have the base image on another device -(possibly shared between many VMs). - -You must not write to the origin device if you use this technique! -Of course, you may write to the thin device and take internal snapshots -of the thin volume. - -i) Creating a snapshot of an external device - - This is the same as creating a thin device. - You don't mention the origin at this stage. - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - -ii) Using a snapshot of an external device. - - Append an extra parameter to the thin target specifying the origin: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" - - N.B. All descendants (internal snapshots) of this snapshot require the - same extra origin parameter. - -Deactivation ------------- - -All devices using a pool must be deactivated before the pool itself -can be. - - dmsetup remove thin - dmsetup remove snap - dmsetup remove pool - -Reference -========= - -'thin-pool' target ------------------- - -i) Constructor - - thin-pool \ - [ []*] - - Optional feature arguments: - - skip_block_zeroing: Skip the zeroing of newly-provisioned blocks. - - ignore_discard: Disable discard support. - - no_discard_passdown: Don't pass discards down to the underlying - data device, but just remove the mapping. - - read_only: Don't allow any changes to be made to the pool - metadata. This mode is only available after the - thin-pool has been created and first used in full - read/write mode. It cannot be specified on initial - thin-pool creation. - - error_if_no_space: Error IOs, instead of queueing, if no space. - - Data block size must be between 64KB (128 sectors) and 1GB - (2097152 sectors) inclusive. - - -ii) Status - - / - / - ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space - needs_check|- metadata_low_watermark - - transaction id: - A 64-bit number used by userspace to help synchronise with metadata - from volume managers. - - used data blocks / total data blocks - If the number of free blocks drops below the pool's low water mark a - dm event will be sent to userspace. This event is edge-triggered and - it will occur only once after each resume so volume manager writers - should register for the event and then check the target's status. - - held metadata root: - The location, in blocks, of the metadata root that has been - 'held' for userspace read access. '-' indicates there is no - held root. - - discard_passdown|no_discard_passdown - Whether or not discards are actually being passed down to the - underlying device. When this is enabled when loading the table, - it can get disabled if the underlying device doesn't support it. - - ro|rw|out_of_data_space - If the pool encounters certain types of device failures it will - drop into a read-only metadata mode in which no changes to - the pool metadata (like allocating new blocks) are permitted. - - In serious cases where even a read-only mode is deemed unsafe - no further I/O will be permitted and the status will just - contain the string 'Fail'. The userspace recovery tools - should then be used. - - error_if_no_space|queue_if_no_space - If the pool runs out of data or metadata space, the pool will - either queue or error the IO destined to the data device. The - default is to queue the IO until more space is added or the - 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool - module parameter can be used to change this timeout -- it - defaults to 60 seconds but may be disabled using a value of 0. - - needs_check - A metadata operation has failed, resulting in the needs_check - flag being set in the metadata's superblock. The metadata - device must be deactivated and checked/repaired before the - thin-pool can be made fully operational again. '-' indicates - needs_check is not set. - - metadata_low_watermark: - Value of metadata low watermark in blocks. The kernel sets this - value internally but userspace needs to know this value to - determine if an event was caused by crossing this threshold. - -iii) Messages - - create_thin - - Create a new thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - - create_snap - - Create a new snapshot of another thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - is the identifier of the thinly-provisioned device - of which the new device will be a snapshot. - - delete - - Deletes a thin device. Irreversible. - - set_transaction_id - - Userland volume managers, such as LVM, need a way to - synchronise their external metadata with the internal metadata of the - pool target. The thin-pool target offers to store an - arbitrary 64-bit transaction id and return it on the target's - status line. To avoid races you must provide what you think - the current transaction id is when you change it with this - compare-and-swap message. - - reserve_metadata_snap - - Reserve a copy of the data mapping btree for use by userland. - This allows userland to inspect the mappings as they were when - this message was executed. Use the pool's status command to - get the root block associated with the metadata snapshot. - - release_metadata_snap - - Release a previously reserved copy of the data mapping btree. - -'thin' target -------------- - -i) Constructor - - thin [] - - pool dev: - the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 - - dev id: - the internal device identifier of the device to be - activated. - - external origin dev: - an optional block device outside the pool to be treated as a - read-only snapshot origin: reads to unprovisioned areas of the - thin target will be mapped to this device. - -The pool doesn't store any size against the thin devices. If you -load a thin target that is smaller than you've been using previously, -then you'll have no access to blocks mapped beyond the end. If you -load a target that is bigger than before, then extra blocks will be -provisioned as and when needed. - -ii) Status - - - - If the pool has encountered device errors and failed, the status - will just contain the string 'Fail'. The userspace recovery - tools should then be used. - - In the case where is 0, there is no highest - mapped sector and the value of is unspecified. diff --git a/Documentation/device-mapper/unstriped.rst b/Documentation/device-mapper/unstriped.rst new file mode 100644 index 000000000000..0a8d3eb3f072 --- /dev/null +++ b/Documentation/device-mapper/unstriped.rst @@ -0,0 +1,135 @@ +================================ +Device-mapper "unstriped" target +================================ + +Introduction +============ + +The device-mapper "unstriped" target provides a transparent mechanism to +unstripe a device-mapper "striped" target to access the underlying disks +without having to touch the true backing block-device. It can also be +used to unstripe a hardware RAID-0 to access backing disks. + +Parameters: + + + + The number of stripes in the RAID 0. + + + The amount of 512B sectors in the chunk striping. + + + The block device you wish to unstripe. + + + The stripe number within the device that corresponds to physical + drive you wish to unstripe. This must be 0 indexed. + + +Why use this module? +==================== + +An example of undoing an existing dm-stripe +------------------------------------------- + +This small bash script will setup 4 loop devices and use the existing +striped target to combine the 4 devices into one. It then will use +the unstriped target ontop of the striped device to access the +individual backing loop devices. We write data to the newly exposed +unstriped devices and verify the data written matches the correct +underlying device on the striped array:: + + #!/bin/bash + + MEMBER_SIZE=$((128 * 1024 * 1024)) + NUM=4 + SEQ_END=$((${NUM}-1)) + CHUNK=256 + BS=4096 + + RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) + DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" + COUNT=$((${MEMBER_SIZE} / ${BS})) + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" + done + + echo $DM_PARMS | dmsetup create raid0 + for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} + done + + dmsetup remove raid0 + + for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} + done + +Another example +--------------- + +Intel NVMe drives contain two cores on the physical device. +Each core of the drive has segregated access to its LBA range. +The current LBA model has a RAID 0 128k chunk on each core, resulting +in a 256k stripe across the two cores:: + + Core 0: Core 1: + __________ __________ + | LBA 512| | LBA 768| + | LBA 0 | | LBA 256| + ---------- ---------- + +The purpose of this unstriping is to provide better QoS in noisy +neighbor environments. When two partitions are created on the +aggregate drive without this unstriping, reads on one partition +can affect writes on another partition. This is because the partitions +are striped across the two cores. When we unstripe this hardware RAID 0 +and make partitions on each new exposed device the two partitions are now +physically separated. + +With the dm-unstriped target we're able to segregate an fio script that +has read and write jobs that are independent of each other. Compared to +when we run the test on a combined drive with partitions, we were able +to get a 92% reduction in read latency using this device mapper target. + + +Example dmsetup usage +===================== + +unstriped ontop of Intel NVMe device that has 2 cores +----------------------------------------------------- + +:: + + dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' + dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +There will now be two devices that expose Intel NVMe core 0 and 1 +respectively:: + + /dev/mapper/nvmset0 + /dev/mapper/nvmset1 + +unstriped ontop of striped with 4 drives using 128K chunk size +-------------------------------------------------------------- + +:: + + dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' + dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' + dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' + dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/device-mapper/unstriped.txt b/Documentation/device-mapper/unstriped.txt deleted file mode 100644 index 0b2a306c54ee..000000000000 --- a/Documentation/device-mapper/unstriped.txt +++ /dev/null @@ -1,124 +0,0 @@ -Introduction -============ - -The device-mapper "unstriped" target provides a transparent mechanism to -unstripe a device-mapper "striped" target to access the underlying disks -without having to touch the true backing block-device. It can also be -used to unstripe a hardware RAID-0 to access backing disks. - -Parameters: - - - - The number of stripes in the RAID 0. - - - The amount of 512B sectors in the chunk striping. - - - The block device you wish to unstripe. - - - The stripe number within the device that corresponds to physical - drive you wish to unstripe. This must be 0 indexed. - - -Why use this module? -==================== - -An example of undoing an existing dm-stripe -------------------------------------------- - -This small bash script will setup 4 loop devices and use the existing -striped target to combine the 4 devices into one. It then will use -the unstriped target ontop of the striped device to access the -individual backing loop devices. We write data to the newly exposed -unstriped devices and verify the data written matches the correct -underlying device on the striped array. - -#!/bin/bash - -MEMBER_SIZE=$((128 * 1024 * 1024)) -NUM=4 -SEQ_END=$((${NUM}-1)) -CHUNK=256 -BS=4096 - -RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) -DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" -COUNT=$((${MEMBER_SIZE} / ${BS})) - -for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct - losetup /dev/loop${i} member-${i} - DM_PARMS+=" /dev/loop${i} 0" -done - -echo $DM_PARMS | dmsetup create raid0 -for i in $(seq 0 ${SEQ_END}); do - echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} -done; - -for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct - diff /dev/mapper/set-${i} member-${i} -done; - -for i in $(seq 0 ${SEQ_END}); do - dmsetup remove set-${i} -done - -dmsetup remove raid0 - -for i in $(seq 0 ${SEQ_END}); do - losetup -d /dev/loop${i} - rm -f member-${i} -done - -Another example ---------------- - -Intel NVMe drives contain two cores on the physical device. -Each core of the drive has segregated access to its LBA range. -The current LBA model has a RAID 0 128k chunk on each core, resulting -in a 256k stripe across the two cores: - - Core 0: Core 1: - __________ __________ - | LBA 512| | LBA 768| - | LBA 0 | | LBA 256| - ---------- ---------- - -The purpose of this unstriping is to provide better QoS in noisy -neighbor environments. When two partitions are created on the -aggregate drive without this unstriping, reads on one partition -can affect writes on another partition. This is because the partitions -are striped across the two cores. When we unstripe this hardware RAID 0 -and make partitions on each new exposed device the two partitions are now -physically separated. - -With the dm-unstriped target we're able to segregate an fio script that -has read and write jobs that are independent of each other. Compared to -when we run the test on a combined drive with partitions, we were able -to get a 92% reduction in read latency using this device mapper target. - - -Example dmsetup usage -===================== - -unstriped ontop of Intel NVMe device that has 2 cores ------------------------------------------------------ -dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' -dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' - -There will now be two devices that expose Intel NVMe core 0 and 1 -respectively: -/dev/mapper/nvmset0 -/dev/mapper/nvmset1 - -unstriped ontop of striped with 4 drives using 128K chunk size --------------------------------------------------------------- -dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' -dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' -dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' -dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/device-mapper/verity.rst b/Documentation/device-mapper/verity.rst new file mode 100644 index 000000000000..a4d1c1476d72 --- /dev/null +++ b/Documentation/device-mapper/verity.rst @@ -0,0 +1,229 @@ +========= +dm-verity +========= + +Device-Mapper's "verity" target provides transparent integrity checking of +block devices using a cryptographic digest provided by the kernel crypto API. +This target is read-only. + +Construction Parameters +======================= + +:: + + + + + + [<#opt_params> ] + + + This is the type of the on-disk hash format. + + 0 is the original format used in the Chromium OS. + The salt is appended when hashing, digests are stored continuously and + the rest of the block is padded with zeroes. + + 1 is the current format that should be used for new devices. + The salt is prepended when hashing and each digest is + padded with zeroes to the power of two. + + + This is the device containing data, the integrity of which needs to be + checked. It may be specified as a path, like /dev/sdaX, or a device number, + :. + + + This is the device that supplies the hash tree data. It may be + specified similarly to the device path and may be the same device. If the + same device is used, the hash_start should be outside the configured + dm-verity device. + + + The block size on a data device in bytes. + Each block corresponds to one digest on the hash device. + + + The size of a hash block in bytes. + + + The number of data blocks on the data device. Additional blocks are + inaccessible. You can place hashes to the same partition as data, in this + case hashes are placed after . + + + This is the offset, in -blocks, from the start of hash_dev + to the root block of the hash tree. + + + The cryptographic hash algorithm used for this device. This should + be the name of the algorithm, like "sha1". + + + The hexadecimal encoding of the cryptographic hash of the root hash block + and the salt. This hash should be trusted as there is no other authenticity + beyond this point. + + + The hexadecimal encoding of the salt value. + +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 ignore_corruption + +ignore_corruption + Log corrupted blocks, but allow read operations to proceed normally. + +restart_on_corruption + Restart the system when a corrupted block is discovered. This option is + not compatible with ignore_corruption and requires user space support to + avoid restart loops. + +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + +use_fec_from_device + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity is encrypted the should be too. + +fec_roots + Number of generator roots. This equals to the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks + The number of encoding data blocks on the FEC device. The block size for + the FEC device is . + +fec_start + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + +check_at_most_once + Verify data blocks only the first time they are read from the data device, + rather than every time. This reduces the overhead of dm-verity so that it + can be used on systems that are memory and/or CPU constrained. However, it + provides a reduced level of security because only offline tampering of the + data device's content will be detected, not online tampering. + + Hash blocks are still verified each time they are read from the hash device, + since verification of hash blocks is less performance critical than data + blocks, and a hash block will not be verified any more after all the data + blocks it covers have been verified anyway. + +Theory of operation +=================== + +dm-verity is meant to be set up as part of a verified boot path. This +may be anything ranging from a boot using tboot or trustedgrub to just +booting from a known-good device (like a USB drive or CD). + +When a dm-verity device is configured, it is expected that the caller +has been authenticated in some way (cryptographic signatures, etc). +After instantiation, all hashes will be verified on-demand during +disk access. If they cannot be verified up to the root node of the +tree, the root hash, then the I/O will fail. This should detect +tampering with any data on the device and the hash data. + +Cryptographic hashes are used to assert the integrity of the device on a +per-block basis. This allows for a lightweight hash computation on first read +into the page cache. Block hashes are stored linearly, aligned to the nearest +block size. + +If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + +Hash Tree +--------- + +Each node in the tree is a cryptographic hash. If it is a leaf node, the hash +of some data block on disk is calculated. If it is an intermediary node, +the hash of a number of child nodes is calculated. + +Each entry in the tree is a collection of neighboring nodes that fit in one +block. The number is determined based on block_size and the size of the +selected cryptographic digest algorithm. The hashes are linearly-ordered in +this entry and any unaligned trailing space is ignored but included when +calculating the parent node. + +The tree looks something like: + + alg = sha256, num_blocks = 32768, block_size = 4096 + +:: + + [ root ] + / . . . \ + [entry_0] [entry_1] + / . . . \ . . . \ + [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] + / ... \ / . . . \ / \ + blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 + + +On-disk format +============== + +The verity kernel code does not read the verity metadata on-disk header. +It only reads the hash blocks which directly follow the header. +It is expected that a user-space tool will verify the integrity of the +verity header. + +Alternatively, the header can be omitted and the dmsetup parameters can +be passed via the kernel command-line in a rooted chain of trust where +the command-line is verified. + +Directly following the header (and with sector number padded to the next hash +block boundary) are the hash blocks which are stored a depth at a time +(starting from the root), sorted in order of increasing index. + +The full specification of kernel parameters and on-disk metadata format +is available at the cryptsetup project's wiki page + + https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity + +Status +====== +V (for Valid) is returned if every check performed so far was valid. +If any check failed, C (for Corruption) is returned. + +Example +======= +Set up a device:: + + # dmsetup create vroot --readonly --table \ + "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ + "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ + "1234000000000000000000000000000000000000000000000000000000000000" + +A command line tool veritysetup is available to compute or verify +the hash tree or activate the kernel device. This is available from +the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ +(as a libcryptsetup extension). + +Create hash on the device:: + + # veritysetup format /dev/sda1 /dev/sda2 + ... + Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 + +Activate the device:: + + # veritysetup create vroot /dev/sda1 /dev/sda2 \ + 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt deleted file mode 100644 index b3d2e4a42255..000000000000 --- a/Documentation/device-mapper/verity.txt +++ /dev/null @@ -1,219 +0,0 @@ -dm-verity -========== - -Device-Mapper's "verity" target provides transparent integrity checking of -block devices using a cryptographic digest provided by the kernel crypto API. -This target is read-only. - -Construction Parameters -======================= - - - - - [<#opt_params> ] - - - This is the type of the on-disk hash format. - - 0 is the original format used in the Chromium OS. - The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeroes. - - 1 is the current format that should be used for new devices. - The salt is prepended when hashing and each digest is - padded with zeroes to the power of two. - - - This is the device containing data, the integrity of which needs to be - checked. It may be specified as a path, like /dev/sdaX, or a device number, - :. - - - This is the device that supplies the hash tree data. It may be - specified similarly to the device path and may be the same device. If the - same device is used, the hash_start should be outside the configured - dm-verity device. - - - The block size on a data device in bytes. - Each block corresponds to one digest on the hash device. - - - The size of a hash block in bytes. - - - The number of data blocks on the data device. Additional blocks are - inaccessible. You can place hashes to the same partition as data, in this - case hashes are placed after . - - - This is the offset, in -blocks, from the start of hash_dev - to the root block of the hash tree. - - - The cryptographic hash algorithm used for this device. This should - be the name of the algorithm, like "sha1". - - - The hexadecimal encoding of the cryptographic hash of the root hash block - and the salt. This hash should be trusted as there is no other authenticity - beyond this point. - - - The hexadecimal encoding of the salt value. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional paramaters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 1 ignore_corruption - -ignore_corruption - Log corrupted blocks, but allow read operations to proceed normally. - -restart_on_corruption - Restart the system when a corrupted block is discovered. This option is - not compatible with ignore_corruption and requires user space support to - avoid restart loops. - -ignore_zero_blocks - Do not verify blocks that are expected to contain zeroes and always return - zeroes instead. This may be useful if the partition contains unused blocks - that are not guaranteed to contain zeroes. - -use_fec_from_device - Use forward error correction (FEC) to recover from corruption if hash - verification fails. Use encoding data from the specified device. This - may be the same device where data and hash blocks reside, in which case - fec_start must be outside data and hash areas. - - If the encoding data covers additional metadata, it must be accessible - on the hash device after the hash blocks. - - Note: block sizes for data and hash devices must match. Also, if the - verity is encrypted the should be too. - -fec_roots - Number of generator roots. This equals to the number of parity bytes in - the encoding data. For example, in RS(M, N) encoding, the number of roots - is M-N. - -fec_blocks - The number of encoding data blocks on the FEC device. The block size for - the FEC device is . - -fec_start - This is the offset, in blocks, from the start of the - FEC device to the beginning of the encoding data. - -check_at_most_once - Verify data blocks only the first time they are read from the data device, - rather than every time. This reduces the overhead of dm-verity so that it - can be used on systems that are memory and/or CPU constrained. However, it - provides a reduced level of security because only offline tampering of the - data device's content will be detected, not online tampering. - - Hash blocks are still verified each time they are read from the hash device, - since verification of hash blocks is less performance critical than data - blocks, and a hash block will not be verified any more after all the data - blocks it covers have been verified anyway. - -Theory of operation -=================== - -dm-verity is meant to be set up as part of a verified boot path. This -may be anything ranging from a boot using tboot or trustedgrub to just -booting from a known-good device (like a USB drive or CD). - -When a dm-verity device is configured, it is expected that the caller -has been authenticated in some way (cryptographic signatures, etc). -After instantiation, all hashes will be verified on-demand during -disk access. If they cannot be verified up to the root node of the -tree, the root hash, then the I/O will fail. This should detect -tampering with any data on the device and the hash data. - -Cryptographic hashes are used to assert the integrity of the device on a -per-block basis. This allows for a lightweight hash computation on first read -into the page cache. Block hashes are stored linearly, aligned to the nearest -block size. - -If forward error correction (FEC) support is enabled any recovery of -corrupted data will be verified using the cryptographic hash of the -corresponding data. This is why combining error correction with -integrity checking is essential. - -Hash Tree ---------- - -Each node in the tree is a cryptographic hash. If it is a leaf node, the hash -of some data block on disk is calculated. If it is an intermediary node, -the hash of a number of child nodes is calculated. - -Each entry in the tree is a collection of neighboring nodes that fit in one -block. The number is determined based on block_size and the size of the -selected cryptographic digest algorithm. The hashes are linearly-ordered in -this entry and any unaligned trailing space is ignored but included when -calculating the parent node. - -The tree looks something like: - -alg = sha256, num_blocks = 32768, block_size = 4096 - - [ root ] - / . . . \ - [entry_0] [entry_1] - / . . . \ . . . \ - [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] - / ... \ / . . . \ / \ - blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 - - -On-disk format -============== - -The verity kernel code does not read the verity metadata on-disk header. -It only reads the hash blocks which directly follow the header. -It is expected that a user-space tool will verify the integrity of the -verity header. - -Alternatively, the header can be omitted and the dmsetup parameters can -be passed via the kernel command-line in a rooted chain of trust where -the command-line is verified. - -Directly following the header (and with sector number padded to the next hash -block boundary) are the hash blocks which are stored a depth at a time -(starting from the root), sorted in order of increasing index. - -The full specification of kernel parameters and on-disk metadata format -is available at the cryptsetup project's wiki page - https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity - -Status -====== -V (for Valid) is returned if every check performed so far was valid. -If any check failed, C (for Corruption) is returned. - -Example -======= -Set up a device: - # dmsetup create vroot --readonly --table \ - "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ - "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ - "1234000000000000000000000000000000000000000000000000000000000000" - -A command line tool veritysetup is available to compute or verify -the hash tree or activate the kernel device. This is available from -the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ -(as a libcryptsetup extension). - -Create hash on the device: - # veritysetup format /dev/sda1 /dev/sda2 - ... - Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 - -Activate the device: - # veritysetup create vroot /dev/sda1 /dev/sda2 \ - 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/device-mapper/writecache.rst b/Documentation/device-mapper/writecache.rst new file mode 100644 index 000000000000..d3d7690f5e8d --- /dev/null +++ b/Documentation/device-mapper/writecache.rst @@ -0,0 +1,79 @@ +================= +Writecache target +================= + +The writecache target caches writes on persistent memory or on SSD. It +doesn't cache reads because reads are supposed to be cached in page cache +in normal RAM. + +When the device is constructed, the first sector should be zeroed or the +first sector should contain valid superblock from previous invocation. + +Constructor parameters: + +1. type of the cache device - "p" or "s" + + - p - persistent memory + - s - SSD +2. the underlying device that will be cached +3. the cache device +4. block size (4096 is recommended; the maximum block size is the page + size) +5. the number of optional parameters (the parameters with an argument + count as two) + + start_sector n (default: 0) + offset from the start of cache device in 512-byte sectors + high_watermark n (default: 50) + start writeback when the number of used blocks reach this + watermark + low_watermark x (default: 45) + stop writeback when the number of used blocks drops below + this watermark + writeback_jobs n (default: unlimited) + limit the number of blocks that are in flight during + writeback. Setting this value reduces writeback + throughput, but it may improve latency of read requests + autocommit_blocks n (default: 64 for pmem, 65536 for ssd) + when the application writes this amount of blocks without + issuing the FLUSH request, the blocks are automatically + commited + autocommit_time ms (default: 1000) + autocommit time in milliseconds. The data is automatically + commited if this time passes and no FLUSH request is + received + fua (by default on) + applicable only to persistent memory - use the FUA flag + when writing data from persistent memory back to the + underlying device + nofua + applicable only to persistent memory - don't use the FUA + flag when writing back data and send the FLUSH request + afterwards + + - some underlying devices perform better with fua, some + with nofua. The user should test it + +Status: +1. error indicator - 0 if there was no error, otherwise error number +2. the number of blocks +3. the number of free blocks +4. the number of blocks under writeback + +Messages: + flush + flush the cache device. The message returns successfully + if the cache device was flushed without an error + flush_on_suspend + flush the cache device on next suspend. Use this message + when you are going to remove the cache device. The proper + sequence for removing the cache device is: + + 1. send the "flush_on_suspend" message + 2. load an inactive table with a linear target that maps + to the underlying device + 3. suspend the device + 4. ask for status and verify that there are no errors + 5. resume the device, so that it will use the linear + target + 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.txt deleted file mode 100644 index 01532b3008ae..000000000000 --- a/Documentation/device-mapper/writecache.txt +++ /dev/null @@ -1,70 +0,0 @@ -The writecache target caches writes on persistent memory or on SSD. It -doesn't cache reads because reads are supposed to be cached in page cache -in normal RAM. - -When the device is constructed, the first sector should be zeroed or the -first sector should contain valid superblock from previous invocation. - -Constructor parameters: -1. type of the cache device - "p" or "s" - p - persistent memory - s - SSD -2. the underlying device that will be cached -3. the cache device -4. block size (4096 is recommended; the maximum block size is the page - size) -5. the number of optional parameters (the parameters with an argument - count as two) - start_sector n (default: 0) - offset from the start of cache device in 512-byte sectors - high_watermark n (default: 50) - start writeback when the number of used blocks reach this - watermark - low_watermark x (default: 45) - stop writeback when the number of used blocks drops below - this watermark - writeback_jobs n (default: unlimited) - limit the number of blocks that are in flight during - writeback. Setting this value reduces writeback - throughput, but it may improve latency of read requests - autocommit_blocks n (default: 64 for pmem, 65536 for ssd) - when the application writes this amount of blocks without - issuing the FLUSH request, the blocks are automatically - commited - autocommit_time ms (default: 1000) - autocommit time in milliseconds. The data is automatically - commited if this time passes and no FLUSH request is - received - fua (by default on) - applicable only to persistent memory - use the FUA flag - when writing data from persistent memory back to the - underlying device - nofua - applicable only to persistent memory - don't use the FUA - flag when writing back data and send the FLUSH request - afterwards - - some underlying devices perform better with fua, some - with nofua. The user should test it - -Status: -1. error indicator - 0 if there was no error, otherwise error number -2. the number of blocks -3. the number of free blocks -4. the number of blocks under writeback - -Messages: - flush - flush the cache device. The message returns successfully - if the cache device was flushed without an error - flush_on_suspend - flush the cache device on next suspend. Use this message - when you are going to remove the cache device. The proper - sequence for removing the cache device is: - 1. send the "flush_on_suspend" message - 2. load an inactive table with a linear target that maps - to the underlying device - 3. suspend the device - 4. ask for status and verify that there are no errors - 5. resume the device, so that it will use the linear - target - 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/device-mapper/zero.rst b/Documentation/device-mapper/zero.rst new file mode 100644 index 000000000000..11fb5cf4597c --- /dev/null +++ b/Documentation/device-mapper/zero.rst @@ -0,0 +1,37 @@ +======= +dm-zero +======= + +Device-Mapper's "zero" target provides a block-device that always returns +zero'd data on reads and silently drops writes. This is similar behavior to +/dev/zero, but as a block-device instead of a character-device. + +Dm-zero has no target-specific parameters. + +One very interesting use of dm-zero is for creating "sparse" devices in +conjunction with dm-snapshot. A sparse device reports a device-size larger +than the amount of actual storage space available for that device. A user can +write data anywhere within the sparse device and read it back like a normal +device. Reads to previously unwritten areas will return a zero'd buffer. When +enough data has been written to fill up the actual storage space, the sparse +device is deactivated. This can be very useful for testing device and +filesystem limitations. + +To create a sparse device, start by creating a dm-zero device that's the +desired size of the sparse device. For this example, we'll assume a 10TB +sparse device:: + + TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors + echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 + +Then create a snapshot of the zero device, using any available block-device as +the COW device. The size of the COW device will determine the amount of real +space available to the sparse device. For this example, we'll assume /dev/sdb1 +is an available 10GB partition:: + + echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ + dmsetup create sparse1 + +This will create a 10TB sparse device called /dev/mapper/sparse1 that has +10GB of actual storage space available. If more than 10GB of data is written +to this device, it will start returning I/O errors. diff --git a/Documentation/device-mapper/zero.txt b/Documentation/device-mapper/zero.txt deleted file mode 100644 index 20fb38e7fa7e..000000000000 --- a/Documentation/device-mapper/zero.txt +++ /dev/null @@ -1,37 +0,0 @@ -dm-zero -======= - -Device-Mapper's "zero" target provides a block-device that always returns -zero'd data on reads and silently drops writes. This is similar behavior to -/dev/zero, but as a block-device instead of a character-device. - -Dm-zero has no target-specific parameters. - -One very interesting use of dm-zero is for creating "sparse" devices in -conjunction with dm-snapshot. A sparse device reports a device-size larger -than the amount of actual storage space available for that device. A user can -write data anywhere within the sparse device and read it back like a normal -device. Reads to previously unwritten areas will return a zero'd buffer. When -enough data has been written to fill up the actual storage space, the sparse -device is deactivated. This can be very useful for testing device and -filesystem limitations. - -To create a sparse device, start by creating a dm-zero device that's the -desired size of the sparse device. For this example, we'll assume a 10TB -sparse device. - -TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors -echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 - -Then create a snapshot of the zero device, using any available block-device as -the COW device. The size of the COW device will determine the amount of real -space available to the sparse device. For this example, we'll assume /dev/sdb1 -is an available 10GB partition. - -echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ - dmsetup create sparse1 - -This will create a 10TB sparse device called /dev/mapper/sparse1 that has -10GB of actual storage space available. If more than 10GB of data is written -to this device, it will start returning I/O errors. - diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md index 028b3e2e25f9..23e698167141 100644 --- a/Documentation/filesystems/ubifs-authentication.md +++ b/Documentation/filesystems/ubifs-authentication.md @@ -417,9 +417,9 @@ will then have to be provided beforehand in the normal way. [DMC-CBC-ATTACK] http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/ -[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt +[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.rst -[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.txt +[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.rst [FSCRYPT-POLICY2] https://www.spinics.net/lists/linux-ext4/msg58710.html -- cgit From 10ffebbed5503b1830c7920ef528075785351be6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:44 -0300 Subject: docs: fault-injection: convert docs to ReST and rename to *.rst The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Federico Vaga Signed-off-by: Jonathan Corbet --- Documentation/fault-injection/fault-injection.rst | 446 +++++++++++++++++++++ Documentation/fault-injection/fault-injection.txt | 435 -------------------- Documentation/fault-injection/index.rst | 20 + .../fault-injection/notifier-error-inject.rst | 98 +++++ .../fault-injection/notifier-error-inject.txt | 94 ----- .../fault-injection/nvme-fault-injection.rst | 120 ++++++ .../fault-injection/nvme-fault-injection.txt | 116 ------ Documentation/fault-injection/provoke-crashes.rst | 48 +++ Documentation/fault-injection/provoke-crashes.txt | 38 -- Documentation/process/4.Coding.rst | 2 +- .../translations/it_IT/process/4.Coding.rst | 2 +- .../translations/zh_CN/process/4.Coding.rst | 2 +- 12 files changed, 735 insertions(+), 686 deletions(-) create mode 100644 Documentation/fault-injection/fault-injection.rst delete mode 100644 Documentation/fault-injection/fault-injection.txt create mode 100644 Documentation/fault-injection/index.rst create mode 100644 Documentation/fault-injection/notifier-error-inject.rst delete mode 100644 Documentation/fault-injection/notifier-error-inject.txt create mode 100644 Documentation/fault-injection/nvme-fault-injection.rst delete mode 100644 Documentation/fault-injection/nvme-fault-injection.txt create mode 100644 Documentation/fault-injection/provoke-crashes.rst delete mode 100644 Documentation/fault-injection/provoke-crashes.txt (limited to 'Documentation') diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst new file mode 100644 index 000000000000..f51bb21d20e4 --- /dev/null +++ b/Documentation/fault-injection/fault-injection.rst @@ -0,0 +1,446 @@ +=========================================== +Fault injection capabilities infrastructure +=========================================== + +See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug. + + +Available fault injection capabilities +-------------------------------------- + +- failslab + + injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...) + +- fail_page_alloc + + injects page allocation failures. (alloc_pages(), get_free_pages(), ...) + +- fail_futex + + injects futex deadlock and uaddr fault errors. + +- fail_make_request + + injects disk IO errors on devices permitted by setting + /sys/block//make-it-fail or + /sys/block///make-it-fail. (generic_make_request()) + +- fail_mmc_request + + injects MMC data errors on devices permitted by setting + debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request + +- fail_function + + injects error return on specific functions, which are marked by + ALLOW_ERROR_INJECTION() macro, by setting debugfs entries + under /sys/kernel/debug/fail_function. No boot option supported. + +- NVMe fault injection + + inject NVMe status code and retry flag on devices permitted by setting + debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default + status code is NVME_SC_INVALID_OPCODE with no retry. The status code and + retry flag can be set via the debugfs. + + +Configure fault-injection capabilities behavior +----------------------------------------------- + +debugfs entries +^^^^^^^^^^^^^^^ + +fault-inject-debugfs kernel module provides some debugfs entries for runtime +configuration of fault-injection capabilities. + +- /sys/kernel/debug/fail*/probability: + + likelihood of failure injection, in percent. + + Format: + + Note that one-failure-per-hundred is a very high error rate + for some testcases. Consider setting probability=100 and configure + /sys/kernel/debug/fail*/interval for such testcases. + +- /sys/kernel/debug/fail*/interval: + + specifies the interval between failures, for calls to + should_fail() that pass all the other tests. + + Note that if you enable this, by setting interval>1, you will + probably want to set probability=100. + +- /sys/kernel/debug/fail*/times: + + specifies how many times failures may happen at most. + A value of -1 means "no limit". + +- /sys/kernel/debug/fail*/space: + + specifies an initial resource "budget", decremented by "size" + on each call to should_fail(,size). Failure injection is + suppressed until "space" reaches zero. + +- /sys/kernel/debug/fail*/verbose + + Format: { 0 | 1 | 2 } + + specifies the verbosity of the messages when failure is + injected. '0' means no messages; '1' will print only a single + log line per failure; '2' will print a call trace too -- useful + to debug the problems revealed by fault injection. + +- /sys/kernel/debug/fail*/task-filter: + + Format: { 'Y' | 'N' } + + A value of 'N' disables filtering by process (default). + Any positive value limits failures to only processes indicated by + /proc//make-it-fail==1. + +- /sys/kernel/debug/fail*/require-start, + /sys/kernel/debug/fail*/require-end, + /sys/kernel/debug/fail*/reject-start, + /sys/kernel/debug/fail*/reject-end: + + specifies the range of virtual addresses tested during + stacktrace walking. Failure is injected only if some caller + in the walked stacktrace lies within the required range, and + none lies within the rejected range. + Default required range is [0,ULONG_MAX) (whole of virtual address space). + Default rejected range is [0,0). + +- /sys/kernel/debug/fail*/stacktrace-depth: + + specifies the maximum stacktrace depth walked during search + for a caller within [require-start,require-end) OR + [reject-start,reject-end). + +- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' won't inject failures into + highmem/user allocations. + +- /sys/kernel/debug/failslab/ignore-gfp-wait: +- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will inject failures + only into non-sleep allocations (GFP_ATOMIC allocations). + +- /sys/kernel/debug/fail_page_alloc/min-order: + + specifies the minimum page allocation order to be injected + failures. + +- /sys/kernel/debug/fail_futex/ignore-private: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable failure injections + when dealing with private (address space) futexes. + +- /sys/kernel/debug/fail_function/inject: + + Format: { 'function-name' | '!function-name' | '' } + + specifies the target function of error injection by name. + If the function name leads '!' prefix, given function is + removed from injection list. If nothing specified ('') + injection list is cleared. + +- /sys/kernel/debug/fail_function/injectable: + + (read only) shows error injectable functions and what type of + error values can be specified. The error type will be one of + below; + - NULL: retval must be 0. + - ERRNO: retval must be -1 to -MAX_ERRNO (-4096). + - ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096). + +- /sys/kernel/debug/fail_function//retval: + + specifies the "error" return value to inject to the given + function for given function. This will be created when + user specifies new injection entry. + +Boot option +^^^^^^^^^^^ + +In order to inject faults while debugfs is not available (early boot time), +use the boot option:: + + failslab= + fail_page_alloc= + fail_make_request= + fail_futex= + mmc_core.fail_request=,,, + +proc entries +^^^^^^^^^^^^ + +- /proc//fail-nth, + /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the task fail. + Read from this file returns a integer value. A value of '0' indicates + that the fault setup with a previous write to this file was injected. + A positive integer N indicates that the fault wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + +How to add new fault injection capability +----------------------------------------- + +- #include + +- define the fault attributes + + DECLARE_FAULT_ATTR(name); + + Please see the definition of struct fault_attr in fault-inject.h + for details. + +- provide a way to configure fault attributes + +- boot option + + If you need to enable the fault injection capability from boot time, you can + provide boot option to configure it. There is a helper function for it: + + setup_fault_attr(attr, str); + +- debugfs entries + + failslab, fail_page_alloc, and fail_make_request use this way. + Helper functions: + + fault_create_debugfs_attr(name, parent, attr); + +- module parameters + + If the scope of the fault injection capability is limited to a + single kernel module, it is better to provide module parameters to + configure the fault attributes. + +- add a hook to insert failures + + Upon should_fail() returning true, client code should inject a failure: + + should_fail(attr, size); + +Application Examples +-------------------- + +- Inject slab allocation failures into module init/exit code:: + + #!/bin/bash + + FAILTYPE=failslab + echo Y > /sys/kernel/debug/$FAILTYPE/task-filter + echo 10 > /sys/kernel/debug/$FAILTYPE/probability + echo 100 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 2 > /sys/kernel/debug/$FAILTYPE/verbose + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait + + faulty_system() + { + bash -c "echo 1 > /proc/self/make-it-fail && exec $*" + } + + if [ $# -eq 0 ] + then + echo "Usage: $0 modulename [ modulename ... ]" + exit 1 + fi + + for m in $* + do + echo inserting $m... + faulty_system modprobe $m + + echo removing $m... + faulty_system modprobe -r $m + done + +------------------------------------------------------------------------------ + +- Inject page allocation failures only for a specific module:: + + #!/bin/bash + + FAILTYPE=fail_page_alloc + module=$1 + + if [ -z $module ] + then + echo "Usage: $0 " + exit 1 + fi + + modprobe $module + + if [ ! -d /sys/module/$module/sections ] + then + echo Module $module is not loaded + exit 1 + fi + + cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start + cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end + + echo N > /sys/kernel/debug/$FAILTYPE/task-filter + echo 10 > /sys/kernel/debug/$FAILTYPE/probability + echo 100 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 2 > /sys/kernel/debug/$FAILTYPE/verbose + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem + echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth + + trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT + + echo "Injecting errors into the module $module... (interrupt to stop)" + sleep 1000000 + +------------------------------------------------------------------------------ + +- Inject open_ctree error while btrfs mount:: + + #!/bin/bash + + rm -f testfile.img + dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 + DEVICE=$(losetup --show -f testfile.img) + mkfs.btrfs -f $DEVICE + mkdir -p tmpmnt + + FAILTYPE=fail_function + FAILFUNC=open_ctree + echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject + echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval + echo N > /sys/kernel/debug/$FAILTYPE/task-filter + echo 100 > /sys/kernel/debug/$FAILTYPE/probability + echo 0 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 1 > /sys/kernel/debug/$FAILTYPE/verbose + + mount -t btrfs $DEVICE tmpmnt + if [ $? -ne 0 ] + then + echo "SUCCESS!" + else + echo "FAILED!" + umount tmpmnt + fi + + echo > /sys/kernel/debug/$FAILTYPE/inject + + rmdir tmpmnt + losetup -d $DEVICE + rm testfile.img + + +Tool to run command with failslab or fail_page_alloc +---------------------------------------------------- +In order to make it easier to accomplish the tasks mentioned above, we can use +tools/testing/fault-injection/failcmd.sh. Please run a command +"./tools/testing/fault-injection/failcmd.sh --help" for more information and +see the following examples. + +Examples: + +Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab +allocation failure:: + + # ./tools/testing/fault-injection/failcmd.sh \ + -- make -C tools/testing/selftests/ run_tests + +Same as above except to specify 100 times failures at most instead of one time +at most by default:: + + # ./tools/testing/fault-injection/failcmd.sh --times=100 \ + -- make -C tools/testing/selftests/ run_tests + +Same as above except to inject page allocation failure instead of slab +allocation failure:: + + # env FAILCMD_TYPE=fail_page_alloc \ + ./tools/testing/fault-injection/failcmd.sh --times=100 \ + -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call:: + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + int main() + { + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 1;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + pread(fail_nth, buf, sizeof(buf), 0); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y', + res, err); + if (atoi(buf)) + break; + } + return 0; + } + +An example output:: + + 1-th fault Y: res=-1/23 + 2-th fault Y: res=-1/23 + 3-th fault Y: res=-1/12 + 4-th fault Y: res=-1/12 + 5-th fault Y: res=-1/23 + 6-th fault Y: res=-1/23 + 7-th fault Y: res=-1/23 + 8-th fault Y: res=-1/12 + 9-th fault Y: res=-1/12 + 10-th fault Y: res=-1/12 + 11-th fault Y: res=-1/12 + 12-th fault Y: res=-1/12 + 13-th fault Y: res=-1/12 + 14-th fault Y: res=-1/12 + 15-th fault Y: res=-1/12 + 16-th fault N: res=0/12 diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt deleted file mode 100644 index a17517a083c3..000000000000 --- a/Documentation/fault-injection/fault-injection.txt +++ /dev/null @@ -1,435 +0,0 @@ -Fault injection capabilities infrastructure -=========================================== - -See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug. - - -Available fault injection capabilities --------------------------------------- - -o failslab - - injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...) - -o fail_page_alloc - - injects page allocation failures. (alloc_pages(), get_free_pages(), ...) - -o fail_futex - - injects futex deadlock and uaddr fault errors. - -o fail_make_request - - injects disk IO errors on devices permitted by setting - /sys/block//make-it-fail or - /sys/block///make-it-fail. (generic_make_request()) - -o fail_mmc_request - - injects MMC data errors on devices permitted by setting - debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request - -o fail_function - - injects error return on specific functions, which are marked by - ALLOW_ERROR_INJECTION() macro, by setting debugfs entries - under /sys/kernel/debug/fail_function. No boot option supported. - -o NVMe fault injection - - inject NVMe status code and retry flag on devices permitted by setting - debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default - status code is NVME_SC_INVALID_OPCODE with no retry. The status code and - retry flag can be set via the debugfs. - - -Configure fault-injection capabilities behavior ------------------------------------------------ - -o debugfs entries - -fault-inject-debugfs kernel module provides some debugfs entries for runtime -configuration of fault-injection capabilities. - -- /sys/kernel/debug/fail*/probability: - - likelihood of failure injection, in percent. - Format: - - Note that one-failure-per-hundred is a very high error rate - for some testcases. Consider setting probability=100 and configure - /sys/kernel/debug/fail*/interval for such testcases. - -- /sys/kernel/debug/fail*/interval: - - specifies the interval between failures, for calls to - should_fail() that pass all the other tests. - - Note that if you enable this, by setting interval>1, you will - probably want to set probability=100. - -- /sys/kernel/debug/fail*/times: - - specifies how many times failures may happen at most. - A value of -1 means "no limit". - -- /sys/kernel/debug/fail*/space: - - specifies an initial resource "budget", decremented by "size" - on each call to should_fail(,size). Failure injection is - suppressed until "space" reaches zero. - -- /sys/kernel/debug/fail*/verbose - - Format: { 0 | 1 | 2 } - specifies the verbosity of the messages when failure is - injected. '0' means no messages; '1' will print only a single - log line per failure; '2' will print a call trace too -- useful - to debug the problems revealed by fault injection. - -- /sys/kernel/debug/fail*/task-filter: - - Format: { 'Y' | 'N' } - A value of 'N' disables filtering by process (default). - Any positive value limits failures to only processes indicated by - /proc//make-it-fail==1. - -- /sys/kernel/debug/fail*/require-start: -- /sys/kernel/debug/fail*/require-end: -- /sys/kernel/debug/fail*/reject-start: -- /sys/kernel/debug/fail*/reject-end: - - specifies the range of virtual addresses tested during - stacktrace walking. Failure is injected only if some caller - in the walked stacktrace lies within the required range, and - none lies within the rejected range. - Default required range is [0,ULONG_MAX) (whole of virtual address space). - Default rejected range is [0,0). - -- /sys/kernel/debug/fail*/stacktrace-depth: - - specifies the maximum stacktrace depth walked during search - for a caller within [require-start,require-end) OR - [reject-start,reject-end). - -- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem: - - Format: { 'Y' | 'N' } - default is 'N', setting it to 'Y' won't inject failures into - highmem/user allocations. - -- /sys/kernel/debug/failslab/ignore-gfp-wait: -- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait: - - Format: { 'Y' | 'N' } - default is 'N', setting it to 'Y' will inject failures - only into non-sleep allocations (GFP_ATOMIC allocations). - -- /sys/kernel/debug/fail_page_alloc/min-order: - - specifies the minimum page allocation order to be injected - failures. - -- /sys/kernel/debug/fail_futex/ignore-private: - - Format: { 'Y' | 'N' } - default is 'N', setting it to 'Y' will disable failure injections - when dealing with private (address space) futexes. - -- /sys/kernel/debug/fail_function/inject: - - Format: { 'function-name' | '!function-name' | '' } - specifies the target function of error injection by name. - If the function name leads '!' prefix, given function is - removed from injection list. If nothing specified ('') - injection list is cleared. - -- /sys/kernel/debug/fail_function/injectable: - - (read only) shows error injectable functions and what type of - error values can be specified. The error type will be one of - below; - - NULL: retval must be 0. - - ERRNO: retval must be -1 to -MAX_ERRNO (-4096). - - ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096). - -- /sys/kernel/debug/fail_function//retval: - - specifies the "error" return value to inject to the given - function for given function. This will be created when - user specifies new injection entry. - -o Boot option - -In order to inject faults while debugfs is not available (early boot time), -use the boot option: - - failslab= - fail_page_alloc= - fail_make_request= - fail_futex= - mmc_core.fail_request=,,, - -o proc entries - -- /proc//fail-nth: -- /proc/self/task//fail-nth: - - Write to this file of integer N makes N-th call in the task fail. - Read from this file returns a integer value. A value of '0' indicates - that the fault setup with a previous write to this file was injected. - A positive integer N indicates that the fault wasn't yet injected. - Note that this file enables all types of faults (slab, futex, etc). - This setting takes precedence over all other generic debugfs settings - like probability, interval, times, etc. But per-capability settings - (e.g. fail_futex/ignore-private) take precedence over it. - - This feature is intended for systematic testing of faults in a single - system call. See an example below. - -How to add new fault injection capability ------------------------------------------ - -o #include - -o define the fault attributes - - DECLARE_FAULT_ATTR(name); - - Please see the definition of struct fault_attr in fault-inject.h - for details. - -o provide a way to configure fault attributes - -- boot option - - If you need to enable the fault injection capability from boot time, you can - provide boot option to configure it. There is a helper function for it: - - setup_fault_attr(attr, str); - -- debugfs entries - - failslab, fail_page_alloc, and fail_make_request use this way. - Helper functions: - - fault_create_debugfs_attr(name, parent, attr); - -- module parameters - - If the scope of the fault injection capability is limited to a - single kernel module, it is better to provide module parameters to - configure the fault attributes. - -o add a hook to insert failures - - Upon should_fail() returning true, client code should inject a failure. - - should_fail(attr, size); - -Application Examples --------------------- - -o Inject slab allocation failures into module init/exit code - -#!/bin/bash - -FAILTYPE=failslab -echo Y > /sys/kernel/debug/$FAILTYPE/task-filter -echo 10 > /sys/kernel/debug/$FAILTYPE/probability -echo 100 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 2 > /sys/kernel/debug/$FAILTYPE/verbose -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait - -faulty_system() -{ - bash -c "echo 1 > /proc/self/make-it-fail && exec $*" -} - -if [ $# -eq 0 ] -then - echo "Usage: $0 modulename [ modulename ... ]" - exit 1 -fi - -for m in $* -do - echo inserting $m... - faulty_system modprobe $m - - echo removing $m... - faulty_system modprobe -r $m -done - ------------------------------------------------------------------------------- - -o Inject page allocation failures only for a specific module - -#!/bin/bash - -FAILTYPE=fail_page_alloc -module=$1 - -if [ -z $module ] -then - echo "Usage: $0 " - exit 1 -fi - -modprobe $module - -if [ ! -d /sys/module/$module/sections ] -then - echo Module $module is not loaded - exit 1 -fi - -cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start -cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end - -echo N > /sys/kernel/debug/$FAILTYPE/task-filter -echo 10 > /sys/kernel/debug/$FAILTYPE/probability -echo 100 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 2 > /sys/kernel/debug/$FAILTYPE/verbose -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem -echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth - -trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT - -echo "Injecting errors into the module $module... (interrupt to stop)" -sleep 1000000 - ------------------------------------------------------------------------------- - -o Inject open_ctree error while btrfs mount - -#!/bin/bash - -rm -f testfile.img -dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 -DEVICE=$(losetup --show -f testfile.img) -mkfs.btrfs -f $DEVICE -mkdir -p tmpmnt - -FAILTYPE=fail_function -FAILFUNC=open_ctree -echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject -echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval -echo N > /sys/kernel/debug/$FAILTYPE/task-filter -echo 100 > /sys/kernel/debug/$FAILTYPE/probability -echo 0 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 1 > /sys/kernel/debug/$FAILTYPE/verbose - -mount -t btrfs $DEVICE tmpmnt -if [ $? -ne 0 ] -then - echo "SUCCESS!" -else - echo "FAILED!" - umount tmpmnt -fi - -echo > /sys/kernel/debug/$FAILTYPE/inject - -rmdir tmpmnt -losetup -d $DEVICE -rm testfile.img - - -Tool to run command with failslab or fail_page_alloc ----------------------------------------------------- -In order to make it easier to accomplish the tasks mentioned above, we can use -tools/testing/fault-injection/failcmd.sh. Please run a command -"./tools/testing/fault-injection/failcmd.sh --help" for more information and -see the following examples. - -Examples: - -Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab -allocation failure. - - # ./tools/testing/fault-injection/failcmd.sh \ - -- make -C tools/testing/selftests/ run_tests - -Same as above except to specify 100 times failures at most instead of one time -at most by default. - - # ./tools/testing/fault-injection/failcmd.sh --times=100 \ - -- make -C tools/testing/selftests/ run_tests - -Same as above except to inject page allocation failure instead of slab -allocation failure. - - # env FAILCMD_TYPE=fail_page_alloc \ - ./tools/testing/fault-injection/failcmd.sh --times=100 \ - -- make -C tools/testing/selftests/ run_tests - -Systematic faults using fail-nth ---------------------------------- - -The following code systematically faults 0-th, 1-st, 2-nd and so on -capabilities in the socketpair() system call. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main() -{ - int i, err, res, fail_nth, fds[2]; - char buf[128]; - - system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); - sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); - fail_nth = open(buf, O_RDWR); - for (i = 1;; i++) { - sprintf(buf, "%d", i); - write(fail_nth, buf, strlen(buf)); - res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); - err = errno; - pread(fail_nth, buf, sizeof(buf), 0); - if (res == 0) { - close(fds[0]); - close(fds[1]); - } - printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y', - res, err); - if (atoi(buf)) - break; - } - return 0; -} - -An example output: - -1-th fault Y: res=-1/23 -2-th fault Y: res=-1/23 -3-th fault Y: res=-1/12 -4-th fault Y: res=-1/12 -5-th fault Y: res=-1/23 -6-th fault Y: res=-1/23 -7-th fault Y: res=-1/23 -8-th fault Y: res=-1/12 -9-th fault Y: res=-1/12 -10-th fault Y: res=-1/12 -11-th fault Y: res=-1/12 -12-th fault Y: res=-1/12 -13-th fault Y: res=-1/12 -14-th fault Y: res=-1/12 -15-th fault Y: res=-1/12 -16-th fault N: res=0/12 diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst new file mode 100644 index 000000000000..92b5639ed07a --- /dev/null +++ b/Documentation/fault-injection/index.rst @@ -0,0 +1,20 @@ +:orphan: + +=============== +fault-injection +=============== + +.. toctree:: + :maxdepth: 1 + + fault-injection + notifier-error-inject + nvme-fault-injection + provoke-crashes + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/fault-injection/notifier-error-inject.rst b/Documentation/fault-injection/notifier-error-inject.rst new file mode 100644 index 000000000000..1668b6e48d3a --- /dev/null +++ b/Documentation/fault-injection/notifier-error-inject.rst @@ -0,0 +1,98 @@ +Notifier error injection +======================== + +Notifier error injection provides the ability to inject artificial errors to +specified notifier chain callbacks. It is useful to test the error handling of +notifier call chain failures which is rarely executed. There are kernel +modules that can be used to test the following notifiers. + + * PM notifier + * Memory hotplug notifier + * powerpc pSeries reconfig notifier + * Netdevice notifier + +PM notifier error injection module +---------------------------------- +This feature is controlled through debugfs interface + + /sys/kernel/debug/notifier-error-inject/pm/actions//error + +Possible PM notifier events to be failed are: + + * PM_HIBERNATION_PREPARE + * PM_SUSPEND_PREPARE + * PM_RESTORE_PREPARE + +Example: Inject PM suspend error (-12 = -ENOMEM):: + + # cd /sys/kernel/debug/notifier-error-inject/pm/ + # echo -12 > actions/PM_SUSPEND_PREPARE/error + # echo mem > /sys/power/state + bash: echo: write error: Cannot allocate memory + +Memory hotplug notifier error injection module +---------------------------------------------- +This feature is controlled through debugfs interface + + /sys/kernel/debug/notifier-error-inject/memory/actions//error + +Possible memory notifier events to be failed are: + + * MEM_GOING_ONLINE + * MEM_GOING_OFFLINE + +Example: Inject memory hotplug offline error (-12 == -ENOMEM):: + + # cd /sys/kernel/debug/notifier-error-inject/memory + # echo -12 > actions/MEM_GOING_OFFLINE/error + # echo offline > /sys/devices/system/memory/memoryXXX/state + bash: echo: write error: Cannot allocate memory + +powerpc pSeries reconfig notifier error injection module +-------------------------------------------------------- +This feature is controlled through debugfs interface + + /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions//error + +Possible pSeries reconfig notifier events to be failed are: + + * PSERIES_RECONFIG_ADD + * PSERIES_RECONFIG_REMOVE + * PSERIES_DRCONF_MEM_ADD + * PSERIES_DRCONF_MEM_REMOVE + +Netdevice notifier error injection module +---------------------------------------------- +This feature is controlled through debugfs interface + + /sys/kernel/debug/notifier-error-inject/netdev/actions//error + +Netdevice notifier events which can be failed are: + + * NETDEV_REGISTER + * NETDEV_CHANGEMTU + * NETDEV_CHANGENAME + * NETDEV_PRE_UP + * NETDEV_PRE_TYPE_CHANGE + * NETDEV_POST_INIT + * NETDEV_PRECHANGEMTU + * NETDEV_PRECHANGEUPPER + * NETDEV_CHANGEUPPER + +Example: Inject netdevice mtu change error (-22 == -EINVAL):: + + # cd /sys/kernel/debug/notifier-error-inject/netdev + # echo -22 > actions/NETDEV_CHANGEMTU/error + # ip link set eth0 mtu 1024 + RTNETLINK answers: Invalid argument + +For more usage examples +----------------------- +There are tools/testing/selftests using the notifier error injection features +for CPU and memory notifiers. + + * tools/testing/selftests/cpu-hotplug/on-off-test.sh + * tools/testing/selftests/memory-hotplug/on-off-test.sh + +These scripts first do simple online and offline tests and then do fault +injection tests if notifier error injection module is available. diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.txt deleted file mode 100644 index e861d761de24..000000000000 --- a/Documentation/fault-injection/notifier-error-inject.txt +++ /dev/null @@ -1,94 +0,0 @@ -Notifier error injection -======================== - -Notifier error injection provides the ability to inject artificial errors to -specified notifier chain callbacks. It is useful to test the error handling of -notifier call chain failures which is rarely executed. There are kernel -modules that can be used to test the following notifiers. - - * PM notifier - * Memory hotplug notifier - * powerpc pSeries reconfig notifier - * Netdevice notifier - -PM notifier error injection module ----------------------------------- -This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/pm/actions//error - -Possible PM notifier events to be failed are: - - * PM_HIBERNATION_PREPARE - * PM_SUSPEND_PREPARE - * PM_RESTORE_PREPARE - -Example: Inject PM suspend error (-12 = -ENOMEM) - - # cd /sys/kernel/debug/notifier-error-inject/pm/ - # echo -12 > actions/PM_SUSPEND_PREPARE/error - # echo mem > /sys/power/state - bash: echo: write error: Cannot allocate memory - -Memory hotplug notifier error injection module ----------------------------------------------- -This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/memory/actions//error - -Possible memory notifier events to be failed are: - - * MEM_GOING_ONLINE - * MEM_GOING_OFFLINE - -Example: Inject memory hotplug offline error (-12 == -ENOMEM) - - # cd /sys/kernel/debug/notifier-error-inject/memory - # echo -12 > actions/MEM_GOING_OFFLINE/error - # echo offline > /sys/devices/system/memory/memoryXXX/state - bash: echo: write error: Cannot allocate memory - -powerpc pSeries reconfig notifier error injection module --------------------------------------------------------- -This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions//error - -Possible pSeries reconfig notifier events to be failed are: - - * PSERIES_RECONFIG_ADD - * PSERIES_RECONFIG_REMOVE - * PSERIES_DRCONF_MEM_ADD - * PSERIES_DRCONF_MEM_REMOVE - -Netdevice notifier error injection module ----------------------------------------------- -This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/netdev/actions//error - -Netdevice notifier events which can be failed are: - - * NETDEV_REGISTER - * NETDEV_CHANGEMTU - * NETDEV_CHANGENAME - * NETDEV_PRE_UP - * NETDEV_PRE_TYPE_CHANGE - * NETDEV_POST_INIT - * NETDEV_PRECHANGEMTU - * NETDEV_PRECHANGEUPPER - * NETDEV_CHANGEUPPER - -Example: Inject netdevice mtu change error (-22 == -EINVAL) - - # cd /sys/kernel/debug/notifier-error-inject/netdev - # echo -22 > actions/NETDEV_CHANGEMTU/error - # ip link set eth0 mtu 1024 - RTNETLINK answers: Invalid argument - -For more usage examples ------------------------ -There are tools/testing/selftests using the notifier error injection features -for CPU and memory notifiers. - - * tools/testing/selftests/cpu-hotplug/on-off-test.sh - * tools/testing/selftests/memory-hotplug/on-off-test.sh - -These scripts first do simple online and offline tests and then do fault -injection tests if notifier error injection module is available. diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst new file mode 100644 index 000000000000..bbb1bf3e8650 --- /dev/null +++ b/Documentation/fault-injection/nvme-fault-injection.rst @@ -0,0 +1,120 @@ +NVMe Fault Injection +==================== +Linux's fault injection framework provides a systematic way to support +error injection via debugfs in the /sys/kernel/debug directory. When +enabled, the default NVME_SC_INVALID_OPCODE with no retry will be +injected into the nvme_end_request. Users can change the default status +code and no retry flag via the debugfs. The list of Generic Command +Status can be found in include/linux/nvme.h + +Following examples show how to inject an error into the nvme. + +First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config, +recompile the kernel. After booting up the kernel, do the +following. + +Example 1: Inject default status code with no retry +--------------------------------------------------- + +:: + + mount /dev/nvme0n1 /mnt + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times + echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability + cp a.file /mnt + +Expected Result:: + + cp: cannot stat ‘/mnt/a.file’: Input/output error + +Message from dmesg:: + + FAULT_INJECTION: forcing a failure. + name fault_inject, interval 1, probability 100, space 0, times 1 + CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2 + Hardware name: innotek GmbH VirtualBox/VirtualBox, + BIOS VirtualBox 12/01/2006 + Call Trace: + + dump_stack+0x5c/0x7d + should_fail+0x148/0x170 + nvme_should_fail+0x2f/0x50 [nvme_core] + nvme_process_cq+0xe7/0x1d0 [nvme] + nvme_irq+0x1e/0x40 [nvme] + __handle_irq_event_percpu+0x3a/0x190 + handle_irq_event_percpu+0x30/0x70 + handle_irq_event+0x36/0x60 + handle_fasteoi_irq+0x78/0x120 + handle_irq+0xa7/0x130 + ? tick_irq_enter+0xa8/0xc0 + do_IRQ+0x43/0xc0 + common_interrupt+0xa2/0xa2 + + RIP: 0010:native_safe_halt+0x2/0x10 + RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd + RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 + RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480 + R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000 + ? __sched_text_end+0x4/0x4 + default_idle+0x18/0xf0 + do_idle+0x150/0x1d0 + cpu_startup_entry+0x6f/0x80 + start_kernel+0x4c4/0x4e4 + ? set_init_arg+0x55/0x55 + secondary_startup_64+0xa5/0xb0 + print_req_error: I/O error, dev nvme0n1, sector 9240 + EXT4-fs error (device nvme0n1): ext4_find_entry:1436: + inode #2: comm cp: reading directory lblock 0 + +Example 2: Inject default status code with retry +------------------------------------------------ + +:: + + mount /dev/nvme0n1 /mnt + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times + echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status + echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry + + cp a.file /mnt + +Expected Result:: + + command success without error + +Message from dmesg:: + + FAULT_INJECTION: forcing a failure. + name fault_inject, interval 1, probability 100, space 0, times 1 + CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4 + Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 + Call Trace: + + dump_stack+0x5c/0x7d + should_fail+0x148/0x170 + nvme_should_fail+0x30/0x60 [nvme_core] + nvme_loop_queue_response+0x84/0x110 [nvme_loop] + nvmet_req_complete+0x11/0x40 [nvmet] + nvmet_bio_done+0x28/0x40 [nvmet] + blk_update_request+0xb0/0x310 + blk_mq_end_request+0x18/0x60 + flush_smp_call_function_queue+0x3d/0xf0 + smp_call_function_single_interrupt+0x2c/0xc0 + call_function_single_interrupt+0xa2/0xb0 + + RIP: 0010:native_safe_halt+0x2/0x10 + RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04 + RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 + RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680 + R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000 + ? __sched_text_end+0x4/0x4 + default_idle+0x18/0xf0 + do_idle+0x150/0x1d0 + cpu_startup_entry+0x6f/0x80 + start_secondary+0x187/0x1e0 + secondary_startup_64+0xa5/0xb0 diff --git a/Documentation/fault-injection/nvme-fault-injection.txt b/Documentation/fault-injection/nvme-fault-injection.txt deleted file mode 100644 index 8fbf3bf60b62..000000000000 --- a/Documentation/fault-injection/nvme-fault-injection.txt +++ /dev/null @@ -1,116 +0,0 @@ -NVMe Fault Injection -==================== -Linux's fault injection framework provides a systematic way to support -error injection via debugfs in the /sys/kernel/debug directory. When -enabled, the default NVME_SC_INVALID_OPCODE with no retry will be -injected into the nvme_end_request. Users can change the default status -code and no retry flag via the debugfs. The list of Generic Command -Status can be found in include/linux/nvme.h - -Following examples show how to inject an error into the nvme. - -First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config, -recompile the kernel. After booting up the kernel, do the -following. - -Example 1: Inject default status code with no retry ---------------------------------------------------- - -mount /dev/nvme0n1 /mnt -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times -echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability -cp a.file /mnt - -Expected Result: - -cp: cannot stat ‘/mnt/a.file’: Input/output error - -Message from dmesg: - -FAULT_INJECTION: forcing a failure. -name fault_inject, interval 1, probability 100, space 0, times 1 -CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2 -Hardware name: innotek GmbH VirtualBox/VirtualBox, -BIOS VirtualBox 12/01/2006 -Call Trace: - - dump_stack+0x5c/0x7d - should_fail+0x148/0x170 - nvme_should_fail+0x2f/0x50 [nvme_core] - nvme_process_cq+0xe7/0x1d0 [nvme] - nvme_irq+0x1e/0x40 [nvme] - __handle_irq_event_percpu+0x3a/0x190 - handle_irq_event_percpu+0x30/0x70 - handle_irq_event+0x36/0x60 - handle_fasteoi_irq+0x78/0x120 - handle_irq+0xa7/0x130 - ? tick_irq_enter+0xa8/0xc0 - do_IRQ+0x43/0xc0 - common_interrupt+0xa2/0xa2 - -RIP: 0010:native_safe_halt+0x2/0x10 -RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd -RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000 -RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 -RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000 -R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480 -R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000 - ? __sched_text_end+0x4/0x4 - default_idle+0x18/0xf0 - do_idle+0x150/0x1d0 - cpu_startup_entry+0x6f/0x80 - start_kernel+0x4c4/0x4e4 - ? set_init_arg+0x55/0x55 - secondary_startup_64+0xa5/0xb0 - print_req_error: I/O error, dev nvme0n1, sector 9240 -EXT4-fs error (device nvme0n1): ext4_find_entry:1436: -inode #2: comm cp: reading directory lblock 0 - -Example 2: Inject default status code with retry ------------------------------------------------- - -mount /dev/nvme0n1 /mnt -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times -echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status -echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry - -cp a.file /mnt - -Expected Result: - -command success without error - -Message from dmesg: - -FAULT_INJECTION: forcing a failure. -name fault_inject, interval 1, probability 100, space 0, times 1 -CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4 -Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 -Call Trace: - - dump_stack+0x5c/0x7d - should_fail+0x148/0x170 - nvme_should_fail+0x30/0x60 [nvme_core] - nvme_loop_queue_response+0x84/0x110 [nvme_loop] - nvmet_req_complete+0x11/0x40 [nvmet] - nvmet_bio_done+0x28/0x40 [nvmet] - blk_update_request+0xb0/0x310 - blk_mq_end_request+0x18/0x60 - flush_smp_call_function_queue+0x3d/0xf0 - smp_call_function_single_interrupt+0x2c/0xc0 - call_function_single_interrupt+0xa2/0xb0 - -RIP: 0010:native_safe_halt+0x2/0x10 -RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04 -RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000 -RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 -RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000 -R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680 -R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000 - ? __sched_text_end+0x4/0x4 - default_idle+0x18/0xf0 - do_idle+0x150/0x1d0 - cpu_startup_entry+0x6f/0x80 - start_secondary+0x187/0x1e0 - secondary_startup_64+0xa5/0xb0 diff --git a/Documentation/fault-injection/provoke-crashes.rst b/Documentation/fault-injection/provoke-crashes.rst new file mode 100644 index 000000000000..9279a3e12278 --- /dev/null +++ b/Documentation/fault-injection/provoke-crashes.rst @@ -0,0 +1,48 @@ +=============== +Provoke crashes +=============== + +The lkdtm module provides an interface to crash or injure the kernel at +predefined crashpoints to evaluate the reliability of crash dumps obtained +using different dumping solutions. The module uses KPROBEs to instrument +crashing points, but can also crash the kernel directly without KRPOBE +support. + + +You can provide the way either through module arguments when inserting +the module, or through a debugfs interface. + +Usage:: + + insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> + [cpoint_count={>0}] + +recur_count + Recursion level for the stack overflow test. Default is 10. + +cpoint_name + Crash point where the kernel is to be crashed. It can be + one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, + FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, + IDE_CORE_CP, DIRECT + +cpoint_type + Indicates the action to be taken on hitting the crash point. + It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, + CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, + WRITE_AFTER_FREE, + +cpoint_count + Indicates the number of times the crash point is to be hit + to trigger an action. The default is 10. + +You can also induce failures by mounting debugfs and writing the type to +/provoke-crash/. E.g.:: + + mount -t debugfs debugfs /mnt + echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY + + +A special file is `DIRECT` which will induce the crash directly without +KPROBE instrumentation. This mode is the only one available when the module +is built on a kernel without KPROBEs support. diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt deleted file mode 100644 index 7a9d3d81525b..000000000000 --- a/Documentation/fault-injection/provoke-crashes.txt +++ /dev/null @@ -1,38 +0,0 @@ -The lkdtm module provides an interface to crash or injure the kernel at -predefined crashpoints to evaluate the reliability of crash dumps obtained -using different dumping solutions. The module uses KPROBEs to instrument -crashing points, but can also crash the kernel directly without KRPOBE -support. - - -You can provide the way either through module arguments when inserting -the module, or through a debugfs interface. - -Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> - [cpoint_count={>0}] - - recur_count : Recursion level for the stack overflow test. Default is 10. - - cpoint_name : Crash point where the kernel is to be crashed. It can be - one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, - FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, - IDE_CORE_CP, DIRECT - - cpoint_type : Indicates the action to be taken on hitting the crash point. - It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, - CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, - WRITE_AFTER_FREE, - - cpoint_count : Indicates the number of times the crash point is to be hit - to trigger an action. The default is 10. - -You can also induce failures by mounting debugfs and writing the type to -/provoke-crash/. E.g., - - mount -t debugfs debugfs /mnt - echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY - - -A special file is `DIRECT' which will induce the crash directly without -KPROBE instrumentation. This mode is the only one available when the module -is built on a kernel without KPROBEs support. diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 4b7a5ab3cec1..13dd893c9f88 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -298,7 +298,7 @@ enabled, a configurable percentage of memory allocations will be made to fail; these failures can be restricted to a specific range of code. Running with fault injection enabled allows the programmer to see how the code responds when things go badly. See -Documentation/fault-injection/fault-injection.txt for more information on +Documentation/fault-injection/fault-injection.rst for more information on how to use this facility. Other kinds of errors can be found with the "sparse" static analysis tool. diff --git a/Documentation/translations/it_IT/process/4.Coding.rst b/Documentation/translations/it_IT/process/4.Coding.rst index c05b89e616dd..a5e36aa60448 100644 --- a/Documentation/translations/it_IT/process/4.Coding.rst +++ b/Documentation/translations/it_IT/process/4.Coding.rst @@ -314,7 +314,7 @@ di allocazione di memoria sarà destinata al fallimento; questi fallimenti possono essere ridotti ad uno specifico pezzo di codice. Procedere con l'inserimento dei fallimenti attivo permette al programmatore di verificare come il codice risponde quando le cose vanno male. Consultate: -Documentation/fault-injection/fault-injection.txt per avere maggiori +Documentation/fault-injection/fault-injection.rst per avere maggiori informazioni su come utilizzare questo strumento. Altre tipologie di errori possono essere riscontrati con lo strumento di diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst index 8bb777941394..b82b1dde3122 100644 --- a/Documentation/translations/zh_CN/process/4.Coding.rst +++ b/Documentation/translations/zh_CN/process/4.Coding.rst @@ -205,7 +205,7 @@ Linus对这个问题给出了最佳答案: 启用故障注入后,内存分配的可配置百分比将失败;这些失败可以限制在特定的代码 范围内。在启用了故障注入的情况下运行,程序员可以看到当情况恶化时代码如何响 应。有关如何使用此工具的详细信息,请参阅 -Documentation/fault-injection/fault-injection.txt。 +Documentation/fault-injection/fault-injection.rst。 使用“sparse”静态分析工具可以发现其他类型的错误。对于sparse,可以警告程序员 用户空间和内核空间地址之间的混淆、big endian和small endian数量的混合、在需 -- cgit From ab42b818954c040fa13639dc031d8541edcecb4b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:45 -0300 Subject: docs: fb: convert docs to ReST and rename to *.rst The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Also, removed the Maintained by, as requested by Geert. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/fb/api.rst | 307 ++++++++++++++++ Documentation/fb/api.txt | 306 ---------------- Documentation/fb/arkfb.rst | 68 ++++ Documentation/fb/arkfb.txt | 68 ---- Documentation/fb/aty128fb.rst | 75 ++++ Documentation/fb/aty128fb.txt | 72 ---- Documentation/fb/cirrusfb.rst | 94 +++++ Documentation/fb/cirrusfb.txt | 97 ------ Documentation/fb/cmap_xfbdev.rst | 56 +++ Documentation/fb/cmap_xfbdev.txt | 53 --- Documentation/fb/deferred_io.rst | 79 +++++ Documentation/fb/deferred_io.txt | 75 ---- Documentation/fb/efifb.rst | 39 +++ Documentation/fb/efifb.txt | 37 -- Documentation/fb/ep93xx-fb.rst | 140 ++++++++ Documentation/fb/ep93xx-fb.txt | 135 -------- Documentation/fb/fbcon.rst | 350 +++++++++++++++++++ Documentation/fb/fbcon.txt | 347 ------------------- Documentation/fb/framebuffer.rst | 353 +++++++++++++++++++ Documentation/fb/framebuffer.txt | 343 ------------------ Documentation/fb/gxfb.rst | 54 +++ Documentation/fb/gxfb.txt | 52 --- Documentation/fb/index.rst | 50 +++ Documentation/fb/intel810.rst | 287 +++++++++++++++ Documentation/fb/intel810.txt | 278 --------------- Documentation/fb/intelfb.rst | 155 +++++++++ Documentation/fb/intelfb.txt | 149 -------- Documentation/fb/internals.rst | 86 +++++ Documentation/fb/internals.txt | 82 ----- Documentation/fb/lxfb.rst | 55 +++ Documentation/fb/lxfb.txt | 52 --- Documentation/fb/matroxfb.rst | 443 ++++++++++++++++++++++++ Documentation/fb/matroxfb.txt | 413 ---------------------- Documentation/fb/metronomefb.rst | 38 ++ Documentation/fb/metronomefb.txt | 36 -- Documentation/fb/modedb.rst | 155 +++++++++ Documentation/fb/modedb.txt | 151 -------- Documentation/fb/pvr2fb.rst | 66 ++++ Documentation/fb/pvr2fb.txt | 65 ---- Documentation/fb/pxafb.rst | 173 +++++++++ Documentation/fb/pxafb.txt | 142 -------- Documentation/fb/s3fb.rst | 82 +++++ Documentation/fb/s3fb.txt | 82 ----- Documentation/fb/sa1100fb.rst | 40 +++ Documentation/fb/sa1100fb.txt | 39 --- Documentation/fb/sh7760fb.rst | 130 +++++++ Documentation/fb/sh7760fb.txt | 131 ------- Documentation/fb/sisfb.rst | 160 +++++++++ Documentation/fb/sisfb.txt | 158 --------- Documentation/fb/sm501.rst | 15 + Documentation/fb/sm501.txt | 10 - Documentation/fb/sm712fb.rst | 35 ++ Documentation/fb/sm712fb.txt | 31 -- Documentation/fb/sstfb.rst | 207 +++++++++++ Documentation/fb/sstfb.txt | 174 ---------- Documentation/fb/tgafb.rst | 71 ++++ Documentation/fb/tgafb.txt | 69 ---- Documentation/fb/tridentfb.rst | 78 +++++ Documentation/fb/tridentfb.txt | 70 ---- Documentation/fb/udlfb.rst | 162 +++++++++ Documentation/fb/udlfb.txt | 159 --------- Documentation/fb/uvesafb.rst | 188 ++++++++++ Documentation/fb/uvesafb.txt | 184 ---------- Documentation/fb/vesafb.rst | 192 ++++++++++ Documentation/fb/vesafb.txt | 181 ---------- Documentation/fb/viafb.rst | 297 ++++++++++++++++ Documentation/fb/viafb.txt | 252 -------------- Documentation/fb/vt8623fb.rst | 64 ++++ Documentation/fb/vt8623fb.txt | 64 ---- 70 files changed, 4845 insertions(+), 4558 deletions(-) create mode 100644 Documentation/fb/api.rst delete mode 100644 Documentation/fb/api.txt create mode 100644 Documentation/fb/arkfb.rst delete mode 100644 Documentation/fb/arkfb.txt create mode 100644 Documentation/fb/aty128fb.rst delete mode 100644 Documentation/fb/aty128fb.txt create mode 100644 Documentation/fb/cirrusfb.rst delete mode 100644 Documentation/fb/cirrusfb.txt create mode 100644 Documentation/fb/cmap_xfbdev.rst delete mode 100644 Documentation/fb/cmap_xfbdev.txt create mode 100644 Documentation/fb/deferred_io.rst delete mode 100644 Documentation/fb/deferred_io.txt create mode 100644 Documentation/fb/efifb.rst delete mode 100644 Documentation/fb/efifb.txt create mode 100644 Documentation/fb/ep93xx-fb.rst delete mode 100644 Documentation/fb/ep93xx-fb.txt create mode 100644 Documentation/fb/fbcon.rst delete mode 100644 Documentation/fb/fbcon.txt create mode 100644 Documentation/fb/framebuffer.rst delete mode 100644 Documentation/fb/framebuffer.txt create mode 100644 Documentation/fb/gxfb.rst delete mode 100644 Documentation/fb/gxfb.txt create mode 100644 Documentation/fb/index.rst create mode 100644 Documentation/fb/intel810.rst delete mode 100644 Documentation/fb/intel810.txt create mode 100644 Documentation/fb/intelfb.rst delete mode 100644 Documentation/fb/intelfb.txt create mode 100644 Documentation/fb/internals.rst delete mode 100644 Documentation/fb/internals.txt create mode 100644 Documentation/fb/lxfb.rst delete mode 100644 Documentation/fb/lxfb.txt create mode 100644 Documentation/fb/matroxfb.rst delete mode 100644 Documentation/fb/matroxfb.txt create mode 100644 Documentation/fb/metronomefb.rst delete mode 100644 Documentation/fb/metronomefb.txt create mode 100644 Documentation/fb/modedb.rst delete mode 100644 Documentation/fb/modedb.txt create mode 100644 Documentation/fb/pvr2fb.rst delete mode 100644 Documentation/fb/pvr2fb.txt create mode 100644 Documentation/fb/pxafb.rst delete mode 100644 Documentation/fb/pxafb.txt create mode 100644 Documentation/fb/s3fb.rst delete mode 100644 Documentation/fb/s3fb.txt create mode 100644 Documentation/fb/sa1100fb.rst delete mode 100644 Documentation/fb/sa1100fb.txt create mode 100644 Documentation/fb/sh7760fb.rst delete mode 100644 Documentation/fb/sh7760fb.txt create mode 100644 Documentation/fb/sisfb.rst delete mode 100644 Documentation/fb/sisfb.txt create mode 100644 Documentation/fb/sm501.rst delete mode 100644 Documentation/fb/sm501.txt create mode 100644 Documentation/fb/sm712fb.rst delete mode 100644 Documentation/fb/sm712fb.txt create mode 100644 Documentation/fb/sstfb.rst delete mode 100644 Documentation/fb/sstfb.txt create mode 100644 Documentation/fb/tgafb.rst delete mode 100644 Documentation/fb/tgafb.txt create mode 100644 Documentation/fb/tridentfb.rst delete mode 100644 Documentation/fb/tridentfb.txt create mode 100644 Documentation/fb/udlfb.rst delete mode 100644 Documentation/fb/udlfb.txt create mode 100644 Documentation/fb/uvesafb.rst delete mode 100644 Documentation/fb/uvesafb.txt create mode 100644 Documentation/fb/vesafb.rst delete mode 100644 Documentation/fb/vesafb.txt create mode 100644 Documentation/fb/viafb.rst delete mode 100644 Documentation/fb/viafb.txt create mode 100644 Documentation/fb/vt8623fb.rst delete mode 100644 Documentation/fb/vt8623fb.txt (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9b16b640ce48..83d6560f10f0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5024,7 +5024,7 @@ vector=percpu: enable percpu vector domain video= [FB] Frame buffer configuration - See Documentation/fb/modedb.txt. + See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [0,1] If set to 1, on receiving an ACPI notify event diff --git a/Documentation/fb/api.rst b/Documentation/fb/api.rst new file mode 100644 index 000000000000..79ec33dded74 --- /dev/null +++ b/Documentation/fb/api.rst @@ -0,0 +1,307 @@ +=========================== +The Frame Buffer Device API +=========================== + +Last revised: June 21, 2011 + + +0. Introduction +--------------- + +This document describes the frame buffer API used by applications to interact +with frame buffer devices. In-kernel APIs between device drivers and the frame +buffer core are not described. + +Due to a lack of documentation in the original frame buffer API, drivers +behaviours differ in subtle (and not so subtle) ways. This document describes +the recommended API implementation, but applications should be prepared to +deal with different behaviours. + + +1. Capabilities +--------------- + +Device and driver capabilities are reported in the fixed screen information +capabilities field:: + + struct fb_fix_screeninfo { + ... + __u16 capabilities; /* see FB_CAP_* */ + ... + }; + +Application should use those capabilities to find out what features they can +expect from the device and driver. + +- FB_CAP_FOURCC + +The driver supports the four character code (FOURCC) based format setting API. +When supported, formats are configured using a FOURCC instead of manually +specifying color components layout. + + +2. Types and visuals +-------------------- + +Pixels are stored in memory in hardware-dependent formats. Applications need +to be aware of the pixel storage format in order to write image data to the +frame buffer memory in the format expected by the hardware. + +Formats are described by frame buffer types and visuals. Some visuals require +additional information, which are stored in the variable screen information +bits_per_pixel, grayscale, red, green, blue and transp fields. + +Visuals describe how color information is encoded and assembled to create +macropixels. Types describe how macropixels are stored in memory. The following +types and visuals are supported. + +- FB_TYPE_PACKED_PIXELS + +Macropixels are stored contiguously in a single plane. If the number of bits +per macropixel is not a multiple of 8, whether macropixels are padded to the +next multiple of 8 bits or packed together into bytes depends on the visual. + +Padding at end of lines may be present and is then reported through the fixed +screen information line_length field. + +- FB_TYPE_PLANES + +Macropixels are split across multiple planes. The number of planes is equal to +the number of bits per macropixel, with plane i'th storing i'th bit from all +macropixels. + +Planes are located contiguously in memory. + +- FB_TYPE_INTERLEAVED_PLANES + +Macropixels are split across multiple planes. The number of planes is equal to +the number of bits per macropixel, with plane i'th storing i'th bit from all +macropixels. + +Planes are interleaved in memory. The interleave factor, defined as the +distance in bytes between the beginning of two consecutive interleaved blocks +belonging to different planes, is stored in the fixed screen information +type_aux field. + +- FB_TYPE_FOURCC + +Macropixels are stored in memory as described by the format FOURCC identifier +stored in the variable screen information grayscale field. + +- FB_VISUAL_MONO01 + +Pixels are black or white and stored on a number of bits (typically one) +specified by the variable screen information bpp field. + +Black pixels are represented by all bits set to 1 and white pixels by all bits +set to 0. When the number of bits per pixel is smaller than 8, several pixels +are packed together in a byte. + +FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only. + +- FB_VISUAL_MONO10 + +Pixels are black or white and stored on a number of bits (typically one) +specified by the variable screen information bpp field. + +Black pixels are represented by all bits set to 0 and white pixels by all bits +set to 1. When the number of bits per pixel is smaller than 8, several pixels +are packed together in a byte. + +FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only. + +- FB_VISUAL_TRUECOLOR + +Pixels are broken into red, green and blue components, and each component +indexes a read-only lookup table for the corresponding value. Lookup tables +are device-dependent, and provide linear or non-linear ramps. + +Each component is stored in a macropixel according to the variable screen +information red, green, blue and transp fields. + +- FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR + +Pixel values are encoded as indices into a colormap that stores red, green and +blue components. The colormap is read-only for FB_VISUAL_STATIC_PSEUDOCOLOR +and read-write for FB_VISUAL_PSEUDOCOLOR. + +Each pixel value is stored in the number of bits reported by the variable +screen information bits_per_pixel field. + +- FB_VISUAL_DIRECTCOLOR + +Pixels are broken into red, green and blue components, and each component +indexes a programmable lookup table for the corresponding value. + +Each component is stored in a macropixel according to the variable screen +information red, green, blue and transp fields. + +- FB_VISUAL_FOURCC + +Pixels are encoded and interpreted as described by the format FOURCC +identifier stored in the variable screen information grayscale field. + + +3. Screen information +--------------------- + +Screen information are queried by applications using the FBIOGET_FSCREENINFO +and FBIOGET_VSCREENINFO ioctls. Those ioctls take a pointer to a +fb_fix_screeninfo and fb_var_screeninfo structure respectively. + +struct fb_fix_screeninfo stores device independent unchangeable information +about the frame buffer device and the current format. Those information can't +be directly modified by applications, but can be changed by the driver when an +application modifies the format:: + + struct fb_fix_screeninfo { + char id[16]; /* identification string eg "TT Builtin" */ + unsigned long smem_start; /* Start of frame buffer mem */ + /* (physical address) */ + __u32 smem_len; /* Length of frame buffer mem */ + __u32 type; /* see FB_TYPE_* */ + __u32 type_aux; /* Interleave for interleaved Planes */ + __u32 visual; /* see FB_VISUAL_* */ + __u16 xpanstep; /* zero if no hardware panning */ + __u16 ypanstep; /* zero if no hardware panning */ + __u16 ywrapstep; /* zero if no hardware ywrap */ + __u32 line_length; /* length of a line in bytes */ + unsigned long mmio_start; /* Start of Memory Mapped I/O */ + /* (physical address) */ + __u32 mmio_len; /* Length of Memory Mapped I/O */ + __u32 accel; /* Indicate to driver which */ + /* specific chip/card we have */ + __u16 capabilities; /* see FB_CAP_* */ + __u16 reserved[2]; /* Reserved for future compatibility */ + }; + +struct fb_var_screeninfo stores device independent changeable information +about a frame buffer device, its current format and video mode, as well as +other miscellaneous parameters:: + + struct fb_var_screeninfo { + __u32 xres; /* visible resolution */ + __u32 yres; + __u32 xres_virtual; /* virtual resolution */ + __u32 yres_virtual; + __u32 xoffset; /* offset from virtual to visible */ + __u32 yoffset; /* resolution */ + + __u32 bits_per_pixel; /* guess what */ + __u32 grayscale; /* 0 = color, 1 = grayscale, */ + /* >1 = FOURCC */ + struct fb_bitfield red; /* bitfield in fb mem if true color, */ + struct fb_bitfield green; /* else only length is significant */ + struct fb_bitfield blue; + struct fb_bitfield transp; /* transparency */ + + __u32 nonstd; /* != 0 Non standard pixel format */ + + __u32 activate; /* see FB_ACTIVATE_* */ + + __u32 height; /* height of picture in mm */ + __u32 width; /* width of picture in mm */ + + __u32 accel_flags; /* (OBSOLETE) see fb_info.flags */ + + /* Timing: All values in pixclocks, except pixclock (of course) */ + __u32 pixclock; /* pixel clock in ps (pico seconds) */ + __u32 left_margin; /* time from sync to picture */ + __u32 right_margin; /* time from picture to sync */ + __u32 upper_margin; /* time from sync to picture */ + __u32 lower_margin; + __u32 hsync_len; /* length of horizontal sync */ + __u32 vsync_len; /* length of vertical sync */ + __u32 sync; /* see FB_SYNC_* */ + __u32 vmode; /* see FB_VMODE_* */ + __u32 rotate; /* angle we rotate counter clockwise */ + __u32 colorspace; /* colorspace for FOURCC-based modes */ + __u32 reserved[4]; /* Reserved for future compatibility */ + }; + +To modify variable information, applications call the FBIOPUT_VSCREENINFO +ioctl with a pointer to a fb_var_screeninfo structure. If the call is +successful, the driver will update the fixed screen information accordingly. + +Instead of filling the complete fb_var_screeninfo structure manually, +applications should call the FBIOGET_VSCREENINFO ioctl and modify only the +fields they care about. + + +4. Format configuration +----------------------- + +Frame buffer devices offer two ways to configure the frame buffer format: the +legacy API and the FOURCC-based API. + + +The legacy API has been the only frame buffer format configuration API for a +long time and is thus widely used by application. It is the recommended API +for applications when using RGB and grayscale formats, as well as legacy +non-standard formats. + +To select a format, applications set the fb_var_screeninfo bits_per_pixel field +to the desired frame buffer depth. Values up to 8 will usually map to +monochrome, grayscale or pseudocolor visuals, although this is not required. + +- For grayscale formats, applications set the grayscale field to one. The red, + blue, green and transp fields must be set to 0 by applications and ignored by + drivers. Drivers must fill the red, blue and green offsets to 0 and lengths + to the bits_per_pixel value. + +- For pseudocolor formats, applications set the grayscale field to zero. The + red, blue, green and transp fields must be set to 0 by applications and + ignored by drivers. Drivers must fill the red, blue and green offsets to 0 + and lengths to the bits_per_pixel value. + +- For truecolor and directcolor formats, applications set the grayscale field + to zero, and the red, blue, green and transp fields to describe the layout of + color components in memory:: + + struct fb_bitfield { + __u32 offset; /* beginning of bitfield */ + __u32 length; /* length of bitfield */ + __u32 msb_right; /* != 0 : Most significant bit is */ + /* right */ + }; + + Pixel values are bits_per_pixel wide and are split in non-overlapping red, + green, blue and alpha (transparency) components. Location and size of each + component in the pixel value are described by the fb_bitfield offset and + length fields. Offset are computed from the right. + + Pixels are always stored in an integer number of bytes. If the number of + bits per pixel is not a multiple of 8, pixel values are padded to the next + multiple of 8 bits. + +Upon successful format configuration, drivers update the fb_fix_screeninfo +type, visual and line_length fields depending on the selected format. + + +The FOURCC-based API replaces format descriptions by four character codes +(FOURCC). FOURCCs are abstract identifiers that uniquely define a format +without explicitly describing it. This is the only API that supports YUV +formats. Drivers are also encouraged to implement the FOURCC-based API for RGB +and grayscale formats. + +Drivers that support the FOURCC-based API report this capability by setting +the FB_CAP_FOURCC bit in the fb_fix_screeninfo capabilities field. + +FOURCC definitions are located in the linux/videodev2.h header. However, and +despite starting with the V4L2_PIX_FMT_prefix, they are not restricted to V4L2 +and don't require usage of the V4L2 subsystem. FOURCC documentation is +available in Documentation/media/uapi/v4l/pixfmt.rst. + +To select a format, applications set the grayscale field to the desired FOURCC. +For YUV formats, they should also select the appropriate colorspace by setting +the colorspace field to one of the colorspaces listed in linux/videodev2.h and +documented in Documentation/media/uapi/v4l/colorspaces.rst. + +The red, green, blue and transp fields are not used with the FOURCC-based API. +For forward compatibility reasons applications must zero those fields, and +drivers must ignore them. Values other than 0 may get a meaning in future +extensions. + +Upon successful format configuration, drivers update the fb_fix_screeninfo +type, visual and line_length fields depending on the selected format. The type +and visual fields are set to FB_TYPE_FOURCC and FB_VISUAL_FOURCC respectively. diff --git a/Documentation/fb/api.txt b/Documentation/fb/api.txt deleted file mode 100644 index d52cf1e3b975..000000000000 --- a/Documentation/fb/api.txt +++ /dev/null @@ -1,306 +0,0 @@ - The Frame Buffer Device API - --------------------------- - -Last revised: June 21, 2011 - - -0. Introduction ---------------- - -This document describes the frame buffer API used by applications to interact -with frame buffer devices. In-kernel APIs between device drivers and the frame -buffer core are not described. - -Due to a lack of documentation in the original frame buffer API, drivers -behaviours differ in subtle (and not so subtle) ways. This document describes -the recommended API implementation, but applications should be prepared to -deal with different behaviours. - - -1. Capabilities ---------------- - -Device and driver capabilities are reported in the fixed screen information -capabilities field. - -struct fb_fix_screeninfo { - ... - __u16 capabilities; /* see FB_CAP_* */ - ... -}; - -Application should use those capabilities to find out what features they can -expect from the device and driver. - -- FB_CAP_FOURCC - -The driver supports the four character code (FOURCC) based format setting API. -When supported, formats are configured using a FOURCC instead of manually -specifying color components layout. - - -2. Types and visuals --------------------- - -Pixels are stored in memory in hardware-dependent formats. Applications need -to be aware of the pixel storage format in order to write image data to the -frame buffer memory in the format expected by the hardware. - -Formats are described by frame buffer types and visuals. Some visuals require -additional information, which are stored in the variable screen information -bits_per_pixel, grayscale, red, green, blue and transp fields. - -Visuals describe how color information is encoded and assembled to create -macropixels. Types describe how macropixels are stored in memory. The following -types and visuals are supported. - -- FB_TYPE_PACKED_PIXELS - -Macropixels are stored contiguously in a single plane. If the number of bits -per macropixel is not a multiple of 8, whether macropixels are padded to the -next multiple of 8 bits or packed together into bytes depends on the visual. - -Padding at end of lines may be present and is then reported through the fixed -screen information line_length field. - -- FB_TYPE_PLANES - -Macropixels are split across multiple planes. The number of planes is equal to -the number of bits per macropixel, with plane i'th storing i'th bit from all -macropixels. - -Planes are located contiguously in memory. - -- FB_TYPE_INTERLEAVED_PLANES - -Macropixels are split across multiple planes. The number of planes is equal to -the number of bits per macropixel, with plane i'th storing i'th bit from all -macropixels. - -Planes are interleaved in memory. The interleave factor, defined as the -distance in bytes between the beginning of two consecutive interleaved blocks -belonging to different planes, is stored in the fixed screen information -type_aux field. - -- FB_TYPE_FOURCC - -Macropixels are stored in memory as described by the format FOURCC identifier -stored in the variable screen information grayscale field. - -- FB_VISUAL_MONO01 - -Pixels are black or white and stored on a number of bits (typically one) -specified by the variable screen information bpp field. - -Black pixels are represented by all bits set to 1 and white pixels by all bits -set to 0. When the number of bits per pixel is smaller than 8, several pixels -are packed together in a byte. - -FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only. - -- FB_VISUAL_MONO10 - -Pixels are black or white and stored on a number of bits (typically one) -specified by the variable screen information bpp field. - -Black pixels are represented by all bits set to 0 and white pixels by all bits -set to 1. When the number of bits per pixel is smaller than 8, several pixels -are packed together in a byte. - -FB_VISUAL_MONO01 is currently used with FB_TYPE_PACKED_PIXELS only. - -- FB_VISUAL_TRUECOLOR - -Pixels are broken into red, green and blue components, and each component -indexes a read-only lookup table for the corresponding value. Lookup tables -are device-dependent, and provide linear or non-linear ramps. - -Each component is stored in a macropixel according to the variable screen -information red, green, blue and transp fields. - -- FB_VISUAL_PSEUDOCOLOR and FB_VISUAL_STATIC_PSEUDOCOLOR - -Pixel values are encoded as indices into a colormap that stores red, green and -blue components. The colormap is read-only for FB_VISUAL_STATIC_PSEUDOCOLOR -and read-write for FB_VISUAL_PSEUDOCOLOR. - -Each pixel value is stored in the number of bits reported by the variable -screen information bits_per_pixel field. - -- FB_VISUAL_DIRECTCOLOR - -Pixels are broken into red, green and blue components, and each component -indexes a programmable lookup table for the corresponding value. - -Each component is stored in a macropixel according to the variable screen -information red, green, blue and transp fields. - -- FB_VISUAL_FOURCC - -Pixels are encoded and interpreted as described by the format FOURCC -identifier stored in the variable screen information grayscale field. - - -3. Screen information ---------------------- - -Screen information are queried by applications using the FBIOGET_FSCREENINFO -and FBIOGET_VSCREENINFO ioctls. Those ioctls take a pointer to a -fb_fix_screeninfo and fb_var_screeninfo structure respectively. - -struct fb_fix_screeninfo stores device independent unchangeable information -about the frame buffer device and the current format. Those information can't -be directly modified by applications, but can be changed by the driver when an -application modifies the format. - -struct fb_fix_screeninfo { - char id[16]; /* identification string eg "TT Builtin" */ - unsigned long smem_start; /* Start of frame buffer mem */ - /* (physical address) */ - __u32 smem_len; /* Length of frame buffer mem */ - __u32 type; /* see FB_TYPE_* */ - __u32 type_aux; /* Interleave for interleaved Planes */ - __u32 visual; /* see FB_VISUAL_* */ - __u16 xpanstep; /* zero if no hardware panning */ - __u16 ypanstep; /* zero if no hardware panning */ - __u16 ywrapstep; /* zero if no hardware ywrap */ - __u32 line_length; /* length of a line in bytes */ - unsigned long mmio_start; /* Start of Memory Mapped I/O */ - /* (physical address) */ - __u32 mmio_len; /* Length of Memory Mapped I/O */ - __u32 accel; /* Indicate to driver which */ - /* specific chip/card we have */ - __u16 capabilities; /* see FB_CAP_* */ - __u16 reserved[2]; /* Reserved for future compatibility */ -}; - -struct fb_var_screeninfo stores device independent changeable information -about a frame buffer device, its current format and video mode, as well as -other miscellaneous parameters. - -struct fb_var_screeninfo { - __u32 xres; /* visible resolution */ - __u32 yres; - __u32 xres_virtual; /* virtual resolution */ - __u32 yres_virtual; - __u32 xoffset; /* offset from virtual to visible */ - __u32 yoffset; /* resolution */ - - __u32 bits_per_pixel; /* guess what */ - __u32 grayscale; /* 0 = color, 1 = grayscale, */ - /* >1 = FOURCC */ - struct fb_bitfield red; /* bitfield in fb mem if true color, */ - struct fb_bitfield green; /* else only length is significant */ - struct fb_bitfield blue; - struct fb_bitfield transp; /* transparency */ - - __u32 nonstd; /* != 0 Non standard pixel format */ - - __u32 activate; /* see FB_ACTIVATE_* */ - - __u32 height; /* height of picture in mm */ - __u32 width; /* width of picture in mm */ - - __u32 accel_flags; /* (OBSOLETE) see fb_info.flags */ - - /* Timing: All values in pixclocks, except pixclock (of course) */ - __u32 pixclock; /* pixel clock in ps (pico seconds) */ - __u32 left_margin; /* time from sync to picture */ - __u32 right_margin; /* time from picture to sync */ - __u32 upper_margin; /* time from sync to picture */ - __u32 lower_margin; - __u32 hsync_len; /* length of horizontal sync */ - __u32 vsync_len; /* length of vertical sync */ - __u32 sync; /* see FB_SYNC_* */ - __u32 vmode; /* see FB_VMODE_* */ - __u32 rotate; /* angle we rotate counter clockwise */ - __u32 colorspace; /* colorspace for FOURCC-based modes */ - __u32 reserved[4]; /* Reserved for future compatibility */ -}; - -To modify variable information, applications call the FBIOPUT_VSCREENINFO -ioctl with a pointer to a fb_var_screeninfo structure. If the call is -successful, the driver will update the fixed screen information accordingly. - -Instead of filling the complete fb_var_screeninfo structure manually, -applications should call the FBIOGET_VSCREENINFO ioctl and modify only the -fields they care about. - - -4. Format configuration ------------------------ - -Frame buffer devices offer two ways to configure the frame buffer format: the -legacy API and the FOURCC-based API. - - -The legacy API has been the only frame buffer format configuration API for a -long time and is thus widely used by application. It is the recommended API -for applications when using RGB and grayscale formats, as well as legacy -non-standard formats. - -To select a format, applications set the fb_var_screeninfo bits_per_pixel field -to the desired frame buffer depth. Values up to 8 will usually map to -monochrome, grayscale or pseudocolor visuals, although this is not required. - -- For grayscale formats, applications set the grayscale field to one. The red, - blue, green and transp fields must be set to 0 by applications and ignored by - drivers. Drivers must fill the red, blue and green offsets to 0 and lengths - to the bits_per_pixel value. - -- For pseudocolor formats, applications set the grayscale field to zero. The - red, blue, green and transp fields must be set to 0 by applications and - ignored by drivers. Drivers must fill the red, blue and green offsets to 0 - and lengths to the bits_per_pixel value. - -- For truecolor and directcolor formats, applications set the grayscale field - to zero, and the red, blue, green and transp fields to describe the layout of - color components in memory. - -struct fb_bitfield { - __u32 offset; /* beginning of bitfield */ - __u32 length; /* length of bitfield */ - __u32 msb_right; /* != 0 : Most significant bit is */ - /* right */ -}; - - Pixel values are bits_per_pixel wide and are split in non-overlapping red, - green, blue and alpha (transparency) components. Location and size of each - component in the pixel value are described by the fb_bitfield offset and - length fields. Offset are computed from the right. - - Pixels are always stored in an integer number of bytes. If the number of - bits per pixel is not a multiple of 8, pixel values are padded to the next - multiple of 8 bits. - -Upon successful format configuration, drivers update the fb_fix_screeninfo -type, visual and line_length fields depending on the selected format. - - -The FOURCC-based API replaces format descriptions by four character codes -(FOURCC). FOURCCs are abstract identifiers that uniquely define a format -without explicitly describing it. This is the only API that supports YUV -formats. Drivers are also encouraged to implement the FOURCC-based API for RGB -and grayscale formats. - -Drivers that support the FOURCC-based API report this capability by setting -the FB_CAP_FOURCC bit in the fb_fix_screeninfo capabilities field. - -FOURCC definitions are located in the linux/videodev2.h header. However, and -despite starting with the V4L2_PIX_FMT_prefix, they are not restricted to V4L2 -and don't require usage of the V4L2 subsystem. FOURCC documentation is -available in Documentation/media/uapi/v4l/pixfmt.rst. - -To select a format, applications set the grayscale field to the desired FOURCC. -For YUV formats, they should also select the appropriate colorspace by setting -the colorspace field to one of the colorspaces listed in linux/videodev2.h and -documented in Documentation/media/uapi/v4l/colorspaces.rst. - -The red, green, blue and transp fields are not used with the FOURCC-based API. -For forward compatibility reasons applications must zero those fields, and -drivers must ignore them. Values other than 0 may get a meaning in future -extensions. - -Upon successful format configuration, drivers update the fb_fix_screeninfo -type, visual and line_length fields depending on the selected format. The type -and visual fields are set to FB_TYPE_FOURCC and FB_VISUAL_FOURCC respectively. diff --git a/Documentation/fb/arkfb.rst b/Documentation/fb/arkfb.rst new file mode 100644 index 000000000000..aeca8773dd7e --- /dev/null +++ b/Documentation/fb/arkfb.rst @@ -0,0 +1,68 @@ +======================================== +arkfb - fbdev driver for ARK Logic chips +======================================== + + +Supported Hardware +================== + + ARK 2000PV chip + ICS 5342 ramdac + + - only BIOS initialized VGA devices supported + - probably not working on big endian + + +Supported Features +================== + + * 4 bpp pseudocolor modes (with 18bit palette, two variants) + * 8 bpp pseudocolor mode (with 18bit palette) + * 16 bpp truecolor modes (RGB 555 and RGB 565) + * 24 bpp truecolor mode (RGB 888) + * 32 bpp truecolor mode (RGB 888) + * text mode (activated by bpp = 0) + * doublescan mode variant (not available in text mode) + * panning in both directions + * suspend/resume support + +Text mode is supported even in higher resolutions, but there is limitation to +lower pixclocks (i got maximum about 70 MHz, it is dependent on specific +hardware). This limitation is not enforced by driver. Text mode supports 8bit +wide fonts only (hardware limitation) and 16bit tall fonts (driver +limitation). Unfortunately character attributes (like color) in text mode are +broken for unknown reason, so its usefulness is limited. + +There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with +packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode +with interleaved planes (1 byte interleave), MSB first. Both modes support +8bit wide fonts only (driver limitation). + +Suspend/resume works on systems that initialize video card during resume and +if device is active (for example used by fbcon). + + +Missing Features +================ +(alias TODO list) + + * secondary (not initialized by BIOS) device support + * big endian support + * DPMS support + * MMIO support + * interlaced mode variant + * support for fontwidths != 8 in 4 bpp modes + * support for fontheight != 16 in text mode + * hardware cursor + * vsync synchronization + * feature connector support + * acceleration support (8514-like 2D) + + +Known bugs +========== + + * character attributes (and cursor) in text mode are broken + +-- +Ondrej Zajicek diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.txt deleted file mode 100644 index e8487a9d6a05..000000000000 --- a/Documentation/fb/arkfb.txt +++ /dev/null @@ -1,68 +0,0 @@ - - arkfb - fbdev driver for ARK Logic chips - ======================================== - - -Supported Hardware -================== - - ARK 2000PV chip - ICS 5342 ramdac - - - only BIOS initialized VGA devices supported - - probably not working on big endian - - -Supported Features -================== - - * 4 bpp pseudocolor modes (with 18bit palette, two variants) - * 8 bpp pseudocolor mode (with 18bit palette) - * 16 bpp truecolor modes (RGB 555 and RGB 565) - * 24 bpp truecolor mode (RGB 888) - * 32 bpp truecolor mode (RGB 888) - * text mode (activated by bpp = 0) - * doublescan mode variant (not available in text mode) - * panning in both directions - * suspend/resume support - -Text mode is supported even in higher resolutions, but there is limitation to -lower pixclocks (i got maximum about 70 MHz, it is dependent on specific -hardware). This limitation is not enforced by driver. Text mode supports 8bit -wide fonts only (hardware limitation) and 16bit tall fonts (driver -limitation). Unfortunately character attributes (like color) in text mode are -broken for unknown reason, so its usefulness is limited. - -There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with -packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode -with interleaved planes (1 byte interleave), MSB first. Both modes support -8bit wide fonts only (driver limitation). - -Suspend/resume works on systems that initialize video card during resume and -if device is active (for example used by fbcon). - - -Missing Features -================ -(alias TODO list) - - * secondary (not initialized by BIOS) device support - * big endian support - * DPMS support - * MMIO support - * interlaced mode variant - * support for fontwidths != 8 in 4 bpp modes - * support for fontheight != 16 in text mode - * hardware cursor - * vsync synchronization - * feature connector support - * acceleration support (8514-like 2D) - - -Known bugs -========== - - * character attributes (and cursor) in text mode are broken - --- -Ondrej Zajicek diff --git a/Documentation/fb/aty128fb.rst b/Documentation/fb/aty128fb.rst new file mode 100644 index 000000000000..3f107718f933 --- /dev/null +++ b/Documentation/fb/aty128fb.rst @@ -0,0 +1,75 @@ +================= +What is aty128fb? +================= + +.. [This file is cloned from VesaFB/matroxfb] + +This is a driver for a graphic framebuffer for ATI Rage128 based devices +on Intel and PPC boxes. + +Advantages: + + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF68_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... but you should not notice + if you use same resolution as you used in textmode. + * still experimental. + + +How to use it? +============== + +Switching modes is done using the video=aty128fb:... modedb +boot parameter or using `fbset` program. + +See Documentation/fb/modedb.rst for more information on modedb +resolutions. + +You should compile in both vgacon (to boot if you remove your Rage128 from +box) and aty128fb (for graphics mode). You should not compile-in vesafb +unless you have primary display on non-Rage128 VBE2.0 device (see +Documentation/fb/vesafb.rst for details). + + +X11 +=== + +XF68_FBDev should generally work fine, but it is non-accelerated. As of +this document, 8 and 32bpp works fine. There have been palette issues +when switching from X to console and back to X. You will have to restart +X to fix this. + + +Configuration +============= + +You can pass kernel command line options to vesafb with +`video=aty128fb:option1,option2:value2,option3` (multiple options should +be separated by comma, values are separated from options by `:`). +Accepted options: + +========= ======================================================= +noaccel do not use acceleration engine. It is default. +accel use acceleration engine. Not finished. +vmode:x chooses PowerMacintosh video mode . Deprecated. +cmode:x chooses PowerMacintosh colour mode . Deprecated. + selects startup videomode. See modedb.txt for detailed + explanation. Default is 640x480x8bpp. +========= ======================================================= + + +Limitations +=========== + +There are known and unknown bugs, features and misfeatures. +Currently there are following known bugs: + + - This driver is still experimental and is not finished. Too many + bugs/errata to list here. + +Brad Douglas diff --git a/Documentation/fb/aty128fb.txt b/Documentation/fb/aty128fb.txt deleted file mode 100644 index b605204fcfe1..000000000000 --- a/Documentation/fb/aty128fb.txt +++ /dev/null @@ -1,72 +0,0 @@ -[This file is cloned from VesaFB/matroxfb] - -What is aty128fb? -================= - -This is a driver for a graphic framebuffer for ATI Rage128 based devices -on Intel and PPC boxes. - -Advantages: - - * It provides a nice large console (128 cols + 48 lines with 1024x768) - without using tiny, unreadable fonts. - * You can run XF68_FBDev on top of /dev/fb0 - * Most important: boot logo :-) - -Disadvantages: - - * graphic mode is slower than text mode... but you should not notice - if you use same resolution as you used in textmode. - * still experimental. - - -How to use it? -============== - -Switching modes is done using the video=aty128fb:... modedb -boot parameter or using `fbset' program. - -See Documentation/fb/modedb.txt for more information on modedb -resolutions. - -You should compile in both vgacon (to boot if you remove your Rage128 from -box) and aty128fb (for graphics mode). You should not compile-in vesafb -unless you have primary display on non-Rage128 VBE2.0 device (see -Documentation/fb/vesafb.txt for details). - - -X11 -=== - -XF68_FBDev should generally work fine, but it is non-accelerated. As of -this document, 8 and 32bpp works fine. There have been palette issues -when switching from X to console and back to X. You will have to restart -X to fix this. - - -Configuration -============= - -You can pass kernel command line options to vesafb with -`video=aty128fb:option1,option2:value2,option3' (multiple options should -be separated by comma, values are separated from options by `:'). -Accepted options: - -noaccel - do not use acceleration engine. It is default. -accel - use acceleration engine. Not finished. -vmode:x - chooses PowerMacintosh video mode . Deprecated. -cmode:x - chooses PowerMacintosh colour mode . Deprecated. - - selects startup videomode. See modedb.txt for detailed - explanation. Default is 640x480x8bpp. - - -Limitations -=========== - -There are known and unknown bugs, features and misfeatures. -Currently there are following known bugs: - + This driver is still experimental and is not finished. Too many - bugs/errata to list here. - --- -Brad Douglas diff --git a/Documentation/fb/cirrusfb.rst b/Documentation/fb/cirrusfb.rst new file mode 100644 index 000000000000..8c3e6c6cb114 --- /dev/null +++ b/Documentation/fb/cirrusfb.rst @@ -0,0 +1,94 @@ +============================================ +Framebuffer driver for Cirrus Logic chipsets +============================================ + +Copyright 1999 Jeff Garzik + + +.. just a little something to get people going; contributors welcome! + + +Chip families supported: + - SD64 + - Piccolo + - Picasso + - Spectrum + - Alpine (GD-543x/4x) + - Picasso4 (GD-5446) + - GD-5480 + - Laguna (GD-546x) + +Bus's supported: + - PCI + - Zorro + +Architectures supported: + - i386 + - Alpha + - PPC (Motorola Powerstack) + - m68k (Amiga) + + + +Default video modes +------------------- +At the moment, there are two kernel command line arguments supported: + +- mode:640x480 +- mode:800x600 +- mode:1024x768 + +Full support for startup video modes (modedb) will be integrated soon. + +Version 1.9.9.1 +--------------- +* Fix memory detection for 512kB case +* 800x600 mode +* Fixed timings +* Hint for AXP: Use -accel false -vyres -1 when changing resolution + + +Version 1.9.4.4 +--------------- +* Preliminary Laguna support +* Overhaul color register routines. +* Associated with the above, console colors are now obtained from a LUT + called 'palette' instead of from the VGA registers. This code was + modelled after that in atyfb and matroxfb. +* Code cleanup, add comments. +* Overhaul SR07 handling. +* Bug fixes. + + +Version 1.9.4.3 +--------------- +* Correctly set default startup video mode. +* Do not override ram size setting. Define + CLGEN_USE_HARDCODED_RAM_SETTINGS if you _do_ want to override the RAM + setting. +* Compile fixes related to new 2.3.x IORESOURCE_IO[PORT] symbol changes. +* Use new 2.3.x resource allocation. +* Some code cleanup. + + +Version 1.9.4.2 +--------------- +* Casting fixes. +* Assertions no longer cause an oops on purpose. +* Bug fixes. + + +Version 1.9.4.1 +--------------- +* Add compatibility support. Now requires a 2.1.x, 2.2.x or 2.3.x kernel. + + +Version 1.9.4 +------------- +* Several enhancements, smaller memory footprint, a few bugfixes. +* Requires kernel 2.3.14-pre1 or later. + + +Version 1.9.3 +------------- +* Bundled with kernel 2.3.14-pre1 or later. diff --git a/Documentation/fb/cirrusfb.txt b/Documentation/fb/cirrusfb.txt deleted file mode 100644 index f75950d330a4..000000000000 --- a/Documentation/fb/cirrusfb.txt +++ /dev/null @@ -1,97 +0,0 @@ - - Framebuffer driver for Cirrus Logic chipsets - Copyright 1999 Jeff Garzik - - - -{ just a little something to get people going; contributors welcome! } - - - -Chip families supported: - SD64 - Piccolo - Picasso - Spectrum - Alpine (GD-543x/4x) - Picasso4 (GD-5446) - GD-5480 - Laguna (GD-546x) - -Bus's supported: - PCI - Zorro - -Architectures supported: - i386 - Alpha - PPC (Motorola Powerstack) - m68k (Amiga) - - - -Default video modes -------------------- -At the moment, there are two kernel command line arguments supported: - -mode:640x480 -mode:800x600 - or -mode:1024x768 - -Full support for startup video modes (modedb) will be integrated soon. - -Version 1.9.9.1 ---------------- -* Fix memory detection for 512kB case -* 800x600 mode -* Fixed timings -* Hint for AXP: Use -accel false -vyres -1 when changing resolution - - -Version 1.9.4.4 ---------------- -* Preliminary Laguna support -* Overhaul color register routines. -* Associated with the above, console colors are now obtained from a LUT - called 'palette' instead of from the VGA registers. This code was - modelled after that in atyfb and matroxfb. -* Code cleanup, add comments. -* Overhaul SR07 handling. -* Bug fixes. - - -Version 1.9.4.3 ---------------- -* Correctly set default startup video mode. -* Do not override ram size setting. Define - CLGEN_USE_HARDCODED_RAM_SETTINGS if you _do_ want to override the RAM - setting. -* Compile fixes related to new 2.3.x IORESOURCE_IO[PORT] symbol changes. -* Use new 2.3.x resource allocation. -* Some code cleanup. - - -Version 1.9.4.2 ---------------- -* Casting fixes. -* Assertions no longer cause an oops on purpose. -* Bug fixes. - - -Version 1.9.4.1 ---------------- -* Add compatibility support. Now requires a 2.1.x, 2.2.x or 2.3.x kernel. - - -Version 1.9.4 -------------- -* Several enhancements, smaller memory footprint, a few bugfixes. -* Requires kernel 2.3.14-pre1 or later. - - -Version 1.9.3 -------------- -* Bundled with kernel 2.3.14-pre1 or later. - - diff --git a/Documentation/fb/cmap_xfbdev.rst b/Documentation/fb/cmap_xfbdev.rst new file mode 100644 index 000000000000..5db5e9787361 --- /dev/null +++ b/Documentation/fb/cmap_xfbdev.rst @@ -0,0 +1,56 @@ +========================== +Understanding fbdev's cmap +========================== + +These notes explain how X's dix layer uses fbdev's cmap structures. + +- example of relevant structures in fbdev as used for a 3-bit grayscale cmap:: + + struct fb_var_screeninfo { + .bits_per_pixel = 8, + .grayscale = 1, + .red = { 4, 3, 0 }, + .green = { 0, 0, 0 }, + .blue = { 0, 0, 0 }, + } + struct fb_fix_screeninfo { + .visual = FB_VISUAL_STATIC_PSEUDOCOLOR, + } + for (i = 0; i < 8; i++) + info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16; + memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8); + memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8); + +- X11 apps do something like the following when trying to use grayscale:: + + for (i=0; i < 8; i++) { + char colorspec[64]; + memset(colorspec,0,64); + sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36); + if (!XParseColor(outputDisplay, testColormap, colorspec, &wantedColor)) + printf("Can't get color %s\n",colorspec); + XAllocColor(outputDisplay, testColormap, &wantedColor); + grays[i] = wantedColor; + } + +There's also named equivalents like gray1..x provided you have an rgb.txt. + +Somewhere in X's callchain, this results in a call to X code that handles the +colormap. For example, Xfbdev hits the following: + +xc-011010/programs/Xserver/dix/colormap.c:: + + FindBestPixel(pentFirst, size, prgb, channel) + + dr = (long) pent->co.local.red - prgb->red; + dg = (long) pent->co.local.green - prgb->green; + db = (long) pent->co.local.blue - prgb->blue; + sq = dr * dr; + UnsignedToBigNum (sq, &sum); + BigNumAdd (&sum, &temp, &sum); + +co.local.red are entries that were brought in through FBIOGETCMAP which come +directly from the info->cmap.red that was listed above. The prgb is the rgb +that the app wants to match to. The above code is doing what looks like a least +squares matching function. That's why the cmap entries can't be set to the left +hand side boundaries of a color range. diff --git a/Documentation/fb/cmap_xfbdev.txt b/Documentation/fb/cmap_xfbdev.txt deleted file mode 100644 index 55e1f0a3d2b4..000000000000 --- a/Documentation/fb/cmap_xfbdev.txt +++ /dev/null @@ -1,53 +0,0 @@ -Understanding fbdev's cmap --------------------------- - -These notes explain how X's dix layer uses fbdev's cmap structures. - -*. example of relevant structures in fbdev as used for a 3-bit grayscale cmap -struct fb_var_screeninfo { - .bits_per_pixel = 8, - .grayscale = 1, - .red = { 4, 3, 0 }, - .green = { 0, 0, 0 }, - .blue = { 0, 0, 0 }, -} -struct fb_fix_screeninfo { - .visual = FB_VISUAL_STATIC_PSEUDOCOLOR, -} -for (i = 0; i < 8; i++) - info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16; -memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8); -memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8); - -*. X11 apps do something like the following when trying to use grayscale. -for (i=0; i < 8; i++) { - char colorspec[64]; - memset(colorspec,0,64); - sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36); - if (!XParseColor(outputDisplay, testColormap, colorspec, &wantedColor)) - printf("Can't get color %s\n",colorspec); - XAllocColor(outputDisplay, testColormap, &wantedColor); - grays[i] = wantedColor; -} -There's also named equivalents like gray1..x provided you have an rgb.txt. - -Somewhere in X's callchain, this results in a call to X code that handles the -colormap. For example, Xfbdev hits the following: - -xc-011010/programs/Xserver/dix/colormap.c: - -FindBestPixel(pentFirst, size, prgb, channel) - -dr = (long) pent->co.local.red - prgb->red; -dg = (long) pent->co.local.green - prgb->green; -db = (long) pent->co.local.blue - prgb->blue; -sq = dr * dr; -UnsignedToBigNum (sq, &sum); -BigNumAdd (&sum, &temp, &sum); - -co.local.red are entries that were brought in through FBIOGETCMAP which come -directly from the info->cmap.red that was listed above. The prgb is the rgb -that the app wants to match to. The above code is doing what looks like a least -squares matching function. That's why the cmap entries can't be set to the left -hand side boundaries of a color range. - diff --git a/Documentation/fb/deferred_io.rst b/Documentation/fb/deferred_io.rst new file mode 100644 index 000000000000..7300cff255a3 --- /dev/null +++ b/Documentation/fb/deferred_io.rst @@ -0,0 +1,79 @@ +=========== +Deferred IO +=========== + +Deferred IO is a way to delay and repurpose IO. It uses host memory as a +buffer and the MMU pagefault as a pretrigger for when to perform the device +IO. The following example may be a useful explanation of how one such setup +works: + +- userspace app like Xfbdev mmaps framebuffer +- deferred IO and driver sets up fault and page_mkwrite handlers +- userspace app tries to write to mmaped vaddress +- we get pagefault and reach fault handler +- fault handler finds and returns physical page +- we get page_mkwrite where we add this page to a list +- schedule a workqueue task to be run after a delay +- app continues writing to that page with no additional cost. this is + the key benefit. +- the workqueue task comes in and mkcleans the pages on the list, then + completes the work associated with updating the framebuffer. this is + the real work talking to the device. +- app tries to write to the address (that has now been mkcleaned) +- get pagefault and the above sequence occurs again + +As can be seen from above, one benefit is roughly to allow bursty framebuffer +writes to occur at minimum cost. Then after some time when hopefully things +have gone quiet, we go and really update the framebuffer which would be +a relatively more expensive operation. + +For some types of nonvolatile high latency displays, the desired image is +the final image rather than the intermediate stages which is why it's okay +to not update for each write that is occurring. + +It may be the case that this is useful in other scenarios as well. Paul Mundt +has mentioned a case where it is beneficial to use the page count to decide +whether to coalesce and issue SG DMA or to do memory bursts. + +Another one may be if one has a device framebuffer that is in an usual format, +say diagonally shifting RGB, this may then be a mechanism for you to allow +apps to pretend to have a normal framebuffer but reswizzle for the device +framebuffer at vsync time based on the touched pagelist. + +How to use it: (for applications) +--------------------------------- +No changes needed. mmap the framebuffer like normal and just use it. + +How to use it: (for fbdev drivers) +---------------------------------- +The following example may be helpful. + +1. Setup your structure. Eg:: + + static struct fb_deferred_io hecubafb_defio = { + .delay = HZ, + .deferred_io = hecubafb_dpy_deferred_io, + }; + +The delay is the minimum delay between when the page_mkwrite trigger occurs +and when the deferred_io callback is called. The deferred_io callback is +explained below. + +2. Setup your deferred IO callback. Eg:: + + static void hecubafb_dpy_deferred_io(struct fb_info *info, + struct list_head *pagelist) + +The deferred_io callback is where you would perform all your IO to the display +device. You receive the pagelist which is the list of pages that were written +to during the delay. You must not modify this list. This callback is called +from a workqueue. + +3. Call init:: + + info->fbdefio = &hecubafb_defio; + fb_deferred_io_init(info); + +4. Call cleanup:: + + fb_deferred_io_cleanup(info); diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt deleted file mode 100644 index 748328370250..000000000000 --- a/Documentation/fb/deferred_io.txt +++ /dev/null @@ -1,75 +0,0 @@ -Deferred IO ------------ - -Deferred IO is a way to delay and repurpose IO. It uses host memory as a -buffer and the MMU pagefault as a pretrigger for when to perform the device -IO. The following example may be a useful explanation of how one such setup -works: - -- userspace app like Xfbdev mmaps framebuffer -- deferred IO and driver sets up fault and page_mkwrite handlers -- userspace app tries to write to mmaped vaddress -- we get pagefault and reach fault handler -- fault handler finds and returns physical page -- we get page_mkwrite where we add this page to a list -- schedule a workqueue task to be run after a delay -- app continues writing to that page with no additional cost. this is - the key benefit. -- the workqueue task comes in and mkcleans the pages on the list, then - completes the work associated with updating the framebuffer. this is - the real work talking to the device. -- app tries to write to the address (that has now been mkcleaned) -- get pagefault and the above sequence occurs again - -As can be seen from above, one benefit is roughly to allow bursty framebuffer -writes to occur at minimum cost. Then after some time when hopefully things -have gone quiet, we go and really update the framebuffer which would be -a relatively more expensive operation. - -For some types of nonvolatile high latency displays, the desired image is -the final image rather than the intermediate stages which is why it's okay -to not update for each write that is occurring. - -It may be the case that this is useful in other scenarios as well. Paul Mundt -has mentioned a case where it is beneficial to use the page count to decide -whether to coalesce and issue SG DMA or to do memory bursts. - -Another one may be if one has a device framebuffer that is in an usual format, -say diagonally shifting RGB, this may then be a mechanism for you to allow -apps to pretend to have a normal framebuffer but reswizzle for the device -framebuffer at vsync time based on the touched pagelist. - -How to use it: (for applications) ---------------------------------- -No changes needed. mmap the framebuffer like normal and just use it. - -How to use it: (for fbdev drivers) ----------------------------------- -The following example may be helpful. - -1. Setup your structure. Eg: - -static struct fb_deferred_io hecubafb_defio = { - .delay = HZ, - .deferred_io = hecubafb_dpy_deferred_io, -}; - -The delay is the minimum delay between when the page_mkwrite trigger occurs -and when the deferred_io callback is called. The deferred_io callback is -explained below. - -2. Setup your deferred IO callback. Eg: -static void hecubafb_dpy_deferred_io(struct fb_info *info, - struct list_head *pagelist) - -The deferred_io callback is where you would perform all your IO to the display -device. You receive the pagelist which is the list of pages that were written -to during the delay. You must not modify this list. This callback is called -from a workqueue. - -3. Call init - info->fbdefio = &hecubafb_defio; - fb_deferred_io_init(info); - -4. Call cleanup - fb_deferred_io_cleanup(info); diff --git a/Documentation/fb/efifb.rst b/Documentation/fb/efifb.rst new file mode 100644 index 000000000000..04840331a00e --- /dev/null +++ b/Documentation/fb/efifb.rst @@ -0,0 +1,39 @@ +============== +What is efifb? +============== + +This is a generic EFI platform driver for Intel based Apple computers. +efifb is only for EFI booted Intel Macs. + +Supported Hardware +================== + +- iMac 17"/20" +- Macbook +- Macbook Pro 15"/17" +- MacMini + +How to use it? +============== + +efifb does not have any kind of autodetection of your machine. +You have to add the following kernel parameters in your elilo.conf:: + + Macbook : + video=efifb:macbook + MacMini : + video=efifb:mini + Macbook Pro 15", iMac 17" : + video=efifb:i17 + Macbook Pro 17", iMac 20" : + video=efifb:i20 + +Accepted options: + +======= =========================================================== +nowc Don't map the framebuffer write combined. This can be used + to workaround side-effects and slowdowns on other CPU cores + when large amounts of console data are written. +======= =========================================================== + +Edgar Hucek diff --git a/Documentation/fb/efifb.txt b/Documentation/fb/efifb.txt deleted file mode 100644 index 1a85c1bdaf38..000000000000 --- a/Documentation/fb/efifb.txt +++ /dev/null @@ -1,37 +0,0 @@ - -What is efifb? -=============== - -This is a generic EFI platform driver for Intel based Apple computers. -efifb is only for EFI booted Intel Macs. - -Supported Hardware -================== - -iMac 17"/20" -Macbook -Macbook Pro 15"/17" -MacMini - -How to use it? -============== - -efifb does not have any kind of autodetection of your machine. -You have to add the following kernel parameters in your elilo.conf: - Macbook : - video=efifb:macbook - MacMini : - video=efifb:mini - Macbook Pro 15", iMac 17" : - video=efifb:i17 - Macbook Pro 17", iMac 20" : - video=efifb:i20 - -Accepted options: - -nowc Don't map the framebuffer write combined. This can be used - to workaround side-effects and slowdowns on other CPU cores - when large amounts of console data are written. - --- -Edgar Hucek diff --git a/Documentation/fb/ep93xx-fb.rst b/Documentation/fb/ep93xx-fb.rst new file mode 100644 index 000000000000..6f7767926d1a --- /dev/null +++ b/Documentation/fb/ep93xx-fb.rst @@ -0,0 +1,140 @@ +================================ +Driver for EP93xx LCD controller +================================ + +The EP93xx LCD controller can drive both standard desktop monitors and +embedded LCD displays. If you have a standard desktop monitor then you +can use the standard Linux video mode database. In your board file:: + + static struct ep93xxfb_mach_info some_board_fb_info = { + .num_modes = EP93XXFB_USE_MODEDB, + .bpp = 16, + }; + +If you have an embedded LCD display then you need to define a video +mode for it as follows:: + + static struct fb_videomode some_board_video_modes[] = { + { + .name = "some_lcd_name", + /* Pixel clock, porches, etc */ + }, + }; + +Note that the pixel clock value is in pico-seconds. You can use the +KHZ2PICOS macro to convert the pixel clock value. Most other values +are in pixel clocks. See Documentation/fb/framebuffer.rst for further +details. + +The ep93xxfb_mach_info structure for your board should look like the +following:: + + static struct ep93xxfb_mach_info some_board_fb_info = { + .num_modes = ARRAY_SIZE(some_board_video_modes), + .modes = some_board_video_modes, + .default_mode = &some_board_video_modes[0], + .bpp = 16, + }; + +The framebuffer device can be registered by adding the following to +your board initialisation function:: + + ep93xx_register_fb(&some_board_fb_info); + +===================== +Video Attribute Flags +===================== + +The ep93xxfb_mach_info structure has a flags field which can be used +to configure the controller. The video attributes flags are fully +documented in section 7 of the EP93xx users' guide. The following +flags are available: + +=============================== ========================================== +EP93XXFB_PCLK_FALLING Clock data on the falling edge of the + pixel clock. The default is to clock + data on the rising edge. + +EP93XXFB_SYNC_BLANK_HIGH Blank signal is active high. By + default the blank signal is active low. + +EP93XXFB_SYNC_HORIZ_HIGH Horizontal sync is active high. By + default the horizontal sync is active low. + +EP93XXFB_SYNC_VERT_HIGH Vertical sync is active high. By + default the vertical sync is active high. +=============================== ========================================== + +The physical address of the framebuffer can be controlled using the +following flags: + +=============================== ====================================== +EP93XXFB_USE_SDCSN0 Use SDCSn[0] for the framebuffer. This + is the default setting. + +EP93XXFB_USE_SDCSN1 Use SDCSn[1] for the framebuffer. + +EP93XXFB_USE_SDCSN2 Use SDCSn[2] for the framebuffer. + +EP93XXFB_USE_SDCSN3 Use SDCSn[3] for the framebuffer. +=============================== ====================================== + +================== +Platform callbacks +================== + +The EP93xx framebuffer driver supports three optional platform +callbacks: setup, teardown and blank. The setup and teardown functions +are called when the framebuffer driver is installed and removed +respectively. The blank function is called whenever the display is +blanked or unblanked. + +The setup and teardown devices pass the platform_device structure as +an argument. The fb_info and ep93xxfb_mach_info structures can be +obtained as follows:: + + static int some_board_fb_setup(struct platform_device *pdev) + { + struct ep93xxfb_mach_info *mach_info = pdev->dev.platform_data; + struct fb_info *fb_info = platform_get_drvdata(pdev); + + /* Board specific framebuffer setup */ + } + +====================== +Setting the video mode +====================== + +The video mode is set using the following syntax:: + + video=XRESxYRES[-BPP][@REFRESH] + +If the EP93xx video driver is built-in then the video mode is set on +the Linux kernel command line, for example:: + + video=ep93xx-fb:800x600-16@60 + +If the EP93xx video driver is built as a module then the video mode is +set when the module is installed:: + + modprobe ep93xx-fb video=320x240 + +============== +Screenpage bug +============== + +At least on the EP9315 there is a silicon bug which causes bit 27 of +the VIDSCRNPAGE (framebuffer physical offset) to be tied low. There is +an unofficial errata for this bug at:: + + http://marc.info/?l=linux-arm-kernel&m=110061245502000&w=2 + +By default the EP93xx framebuffer driver checks if the allocated physical +address has bit 27 set. If it does, then the memory is freed and an +error is returned. The check can be disabled by adding the following +option when loading the driver:: + + ep93xx-fb.check_screenpage_bug=0 + +In some cases it may be possible to reconfigure your SDRAM layout to +avoid this bug. See section 13 of the EP93xx users' guide for details. diff --git a/Documentation/fb/ep93xx-fb.txt b/Documentation/fb/ep93xx-fb.txt deleted file mode 100644 index 5af1bd9effae..000000000000 --- a/Documentation/fb/ep93xx-fb.txt +++ /dev/null @@ -1,135 +0,0 @@ -================================ -Driver for EP93xx LCD controller -================================ - -The EP93xx LCD controller can drive both standard desktop monitors and -embedded LCD displays. If you have a standard desktop monitor then you -can use the standard Linux video mode database. In your board file: - - static struct ep93xxfb_mach_info some_board_fb_info = { - .num_modes = EP93XXFB_USE_MODEDB, - .bpp = 16, - }; - -If you have an embedded LCD display then you need to define a video -mode for it as follows: - - static struct fb_videomode some_board_video_modes[] = { - { - .name = "some_lcd_name", - /* Pixel clock, porches, etc */ - }, - }; - -Note that the pixel clock value is in pico-seconds. You can use the -KHZ2PICOS macro to convert the pixel clock value. Most other values -are in pixel clocks. See Documentation/fb/framebuffer.txt for further -details. - -The ep93xxfb_mach_info structure for your board should look like the -following: - - static struct ep93xxfb_mach_info some_board_fb_info = { - .num_modes = ARRAY_SIZE(some_board_video_modes), - .modes = some_board_video_modes, - .default_mode = &some_board_video_modes[0], - .bpp = 16, - }; - -The framebuffer device can be registered by adding the following to -your board initialisation function: - - ep93xx_register_fb(&some_board_fb_info); - -===================== -Video Attribute Flags -===================== - -The ep93xxfb_mach_info structure has a flags field which can be used -to configure the controller. The video attributes flags are fully -documented in section 7 of the EP93xx users' guide. The following -flags are available: - -EP93XXFB_PCLK_FALLING Clock data on the falling edge of the - pixel clock. The default is to clock - data on the rising edge. - -EP93XXFB_SYNC_BLANK_HIGH Blank signal is active high. By - default the blank signal is active low. - -EP93XXFB_SYNC_HORIZ_HIGH Horizontal sync is active high. By - default the horizontal sync is active low. - -EP93XXFB_SYNC_VERT_HIGH Vertical sync is active high. By - default the vertical sync is active high. - -The physical address of the framebuffer can be controlled using the -following flags: - -EP93XXFB_USE_SDCSN0 Use SDCSn[0] for the framebuffer. This - is the default setting. - -EP93XXFB_USE_SDCSN1 Use SDCSn[1] for the framebuffer. - -EP93XXFB_USE_SDCSN2 Use SDCSn[2] for the framebuffer. - -EP93XXFB_USE_SDCSN3 Use SDCSn[3] for the framebuffer. - -================== -Platform callbacks -================== - -The EP93xx framebuffer driver supports three optional platform -callbacks: setup, teardown and blank. The setup and teardown functions -are called when the framebuffer driver is installed and removed -respectively. The blank function is called whenever the display is -blanked or unblanked. - -The setup and teardown devices pass the platform_device structure as -an argument. The fb_info and ep93xxfb_mach_info structures can be -obtained as follows: - - static int some_board_fb_setup(struct platform_device *pdev) - { - struct ep93xxfb_mach_info *mach_info = pdev->dev.platform_data; - struct fb_info *fb_info = platform_get_drvdata(pdev); - - /* Board specific framebuffer setup */ - } - -====================== -Setting the video mode -====================== - -The video mode is set using the following syntax: - - video=XRESxYRES[-BPP][@REFRESH] - -If the EP93xx video driver is built-in then the video mode is set on -the Linux kernel command line, for example: - - video=ep93xx-fb:800x600-16@60 - -If the EP93xx video driver is built as a module then the video mode is -set when the module is installed: - - modprobe ep93xx-fb video=320x240 - -============== -Screenpage bug -============== - -At least on the EP9315 there is a silicon bug which causes bit 27 of -the VIDSCRNPAGE (framebuffer physical offset) to be tied low. There is -an unofficial errata for this bug at: - http://marc.info/?l=linux-arm-kernel&m=110061245502000&w=2 - -By default the EP93xx framebuffer driver checks if the allocated physical -address has bit 27 set. If it does, then the memory is freed and an -error is returned. The check can be disabled by adding the following -option when loading the driver: - - ep93xx-fb.check_screenpage_bug=0 - -In some cases it may be possible to reconfigure your SDRAM layout to -avoid this bug. See section 13 of the EP93xx users' guide for details. diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst new file mode 100644 index 000000000000..cfb9f7c38f18 --- /dev/null +++ b/Documentation/fb/fbcon.rst @@ -0,0 +1,350 @@ +======================= +The Framebuffer Console +======================= + +The framebuffer console (fbcon), as its name implies, is a text +console running on top of the framebuffer device. It has the functionality of +any standard text console driver, such as the VGA console, with the added +features that can be attributed to the graphical nature of the framebuffer. + +In the x86 architecture, the framebuffer console is optional, and +some even treat it as a toy. For other architectures, it is the only available +display device, text or graphical. + +What are the features of fbcon? The framebuffer console supports +high resolutions, varying font types, display rotation, primitive multihead, +etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature +made available by the underlying graphics card are also possible. + +A. Configuration +================ + +The framebuffer console can be enabled by using your favorite kernel +configuration tool. It is under Device Drivers->Graphics Support->Frame +buffer Devices->Console display driver support->Framebuffer Console Support. +Select 'y' to compile support statically or 'm' for module support. The +module will be fbcon. + +In order for fbcon to activate, at least one framebuffer driver is +required, so choose from any of the numerous drivers available. For x86 +systems, they almost universally have VGA cards, so vga16fb and vesafb will +always be available. However, using a chipset-specific driver will give you +more speed and features, such as the ability to change the video mode +dynamically. + +To display the penguin logo, choose any logo available in Graphics +support->Bootup logo. + +Also, you will need to select at least one compiled-in font, but if +you don't do anything, the kernel configuration tool will select one for you, +usually an 8x16 font. + +GOTCHA: A common bug report is enabling the framebuffer without enabling the +framebuffer console. Depending on the driver, you may get a blanked or +garbled display, but the system still boots to completion. If you are +fortunate to have a driver that does not alter the graphics chip, then you +will still get a VGA console. + +B. Loading +========== + +Possible scenarios: + +1. Driver and fbcon are compiled statically + + Usually, fbcon will automatically take over your console. The notable + exception is vesafb. It needs to be explicitly activated with the + vga= boot option parameter. + +2. Driver is compiled statically, fbcon is compiled as a module + + Depending on the driver, you either get a standard console, or a + garbled display, as mentioned above. To get a framebuffer console, + do a 'modprobe fbcon'. + +3. Driver is compiled as a module, fbcon is compiled statically + + You get your standard console. Once the driver is loaded with + 'modprobe xxxfb', fbcon automatically takes over the console with + the possible exception of using the fbcon=map:n option. See below. + +4. Driver and fbcon are compiled as a module. + + You can load them in any order. Once both are loaded, fbcon will take + over the console. + +C. Boot options + + The framebuffer console has several, largely unknown, boot options + that can change its behavior. + +1. fbcon=font: + + Select the initial font to use. The value 'name' can be any of the + compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, + PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, VGA8x16, VGA8x8. + + Note, not all drivers can handle font with widths not divisible by 8, + such as vga16fb. + +2. fbcon=scrollback:[k] + + The scrollback buffer is memory that is used to preserve display + contents that has already scrolled past your view. This is accessed + by using the Shift-PageUp key combination. The value 'value' is any + integer. It defaults to 32KB. The 'k' suffix is optional, and will + multiply the 'value' by 1024. + +3. fbcon=map:<0123> + + This is an interesting option. It tells which driver gets mapped to + which console. The value '0123' is a sequence that gets repeated until + the total length is 64 which is the number of consoles available. In + the above example, it is expanded to 012301230123... and the mapping + will be:: + + tty | 1 2 3 4 5 6 7 8 9 ... + fb | 0 1 2 3 0 1 2 3 0 ... + + ('cat /proc/fb' should tell you what the fb numbers are) + + One side effect that may be useful is using a map value that exceeds + the number of loaded fb drivers. For example, if only one driver is + available, fb0, adding fbcon=map:1 tells fbcon not to take over the + console. + + Later on, when you want to map the console the to the framebuffer + device, you can use the con2fbmap utility. + +4. fbcon=vc:- + + This option tells fbcon to take over only a range of consoles as + specified by the values 'n1' and 'n2'. The rest of the consoles + outside the given range will still be controlled by the standard + console driver. + + NOTE: For x86 machines, the standard console is the VGA console which + is typically located on the same video card. Thus, the consoles that + are controlled by the VGA console will be garbled. + +4. fbcon=rotate: + + This option changes the orientation angle of the console display. The + value 'n' accepts the following: + + - 0 - normal orientation (0 degree) + - 1 - clockwise orientation (90 degrees) + - 2 - upside down orientation (180 degrees) + - 3 - counterclockwise orientation (270 degrees) + + The angle can be changed anytime afterwards by 'echoing' the same + numbers to any one of the 2 attributes found in + /sys/class/graphics/fbcon: + + - rotate - rotate the display of the active console + - rotate_all - rotate the display of all consoles + + Console rotation will only become available if Framebuffer Console + Rotation support is compiled in your kernel. + + NOTE: This is purely console rotation. Any other applications that + use the framebuffer will remain at their 'normal' orientation. + Actually, the underlying fb driver is totally ignorant of console + rotation. + +5. fbcon=margin: + + This option specifies the color of the margins. The margins are the + leftover area at the right and the bottom of the screen that are not + used by text. By default, this area will be black. The 'color' value + is an integer number that depends on the framebuffer driver being used. + +6. fbcon=nodefer + + If the kernel is compiled with deferred fbcon takeover support, normally + the framebuffer contents, left in place by the firmware/bootloader, will + be preserved until there actually is some text is output to the console. + This option causes fbcon to bind immediately to the fbdev device. + +7. fbcon=logo-pos: + + The only possible 'location' is 'center' (without quotes), and when + given, the bootup logo is moved from the default top-left corner + location to the center of the framebuffer. If more than one logo is + displayed due to multiple CPUs, the collected line of logos is moved + as a whole. + +C. Attaching, Detaching and Unloading + +Before going on to how to attach, detach and unload the framebuffer console, an +illustration of the dependencies may help. + +The console layer, as with most subsystems, needs a driver that interfaces with +the hardware. Thus, in a VGA console:: + + console ---> VGA driver ---> hardware. + +Assuming the VGA driver can be unloaded, one must first unbind the VGA driver +from the console layer before unloading the driver. The VGA driver cannot be +unloaded if it is still bound to the console layer. (See +Documentation/console/console.txt for more information). + +This is more complicated in the case of the framebuffer console (fbcon), +because fbcon is an intermediate layer between the console and the drivers:: + + console ---> fbcon ---> fbdev drivers ---> hardware + +The fbdev drivers cannot be unloaded if bound to fbcon, and fbcon cannot +be unloaded if it's bound to the console layer. + +So to unload the fbdev drivers, one must first unbind fbcon from the console, +then unbind the fbdev drivers from fbcon. Fortunately, unbinding fbcon from +the console layer will automatically unbind framebuffer drivers from +fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from +fbcon. + +So, how do we unbind fbcon from the console? Part of the answer is in +Documentation/console/console.txt. To summarize: + +Echo a value to the bind file that represents the framebuffer console +driver. So assuming vtcon1 represents fbcon, then:: + + echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to + console layer + echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from + console layer + +If fbcon is detached from the console layer, your boot console driver (which is +usually VGA text mode) will take over. A few drivers (rivafb and i810fb) will +restore VGA text mode for you. With the rest, before detaching fbcon, you +must take a few additional steps to make sure that your VGA text mode is +restored properly. The following is one of the several methods that you can do: + +1. Download or install vbetool. This utility is included with most + distributions nowadays, and is usually part of the suspend/resume tool. + +2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set + to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers. + +3. Boot into text mode and as root run:: + + vbetool vbestate save > + + The above command saves the register contents of your graphics + hardware to . You need to do this step only once as + the state file can be reused. + +4. If fbcon is compiled as a module, load fbcon by doing:: + + modprobe fbcon + +5. Now to detach fbcon:: + + vbetool vbestate restore < && \ + echo 0 > /sys/class/vtconsole/vtcon1/bind + +6. That's it, you're back to VGA mode. And if you compiled fbcon as a module, + you can unload it by 'rmmod fbcon'. + +7. To reattach fbcon:: + + echo 1 > /sys/class/vtconsole/vtcon1/bind + +8. Once fbcon is unbound, all drivers registered to the system will also +become unbound. This means that fbcon and individual framebuffer drivers +can be unloaded or reloaded at will. Reloading the drivers or fbcon will +automatically bind the console, fbcon and the drivers together. Unloading +all the drivers without unloading fbcon will make it impossible for the +console to bind fbcon. + +Notes for vesafb users: +======================= + +Unfortunately, if your bootline includes a vga=xxx parameter that sets the +hardware in graphics mode, such as when loading vesafb, vgacon will not load. +Instead, vgacon will replace the default boot console with dummycon, and you +won't get any display after detaching fbcon. Your machine is still alive, so +you can reattach vesafb. However, to reattach vesafb, you need to do one of +the following: + +Variation 1: + + a. Before detaching fbcon, do:: + + vbetool vbemode save > # do once for each vesafb mode, + # the file can be reused + + b. Detach fbcon as in step 5. + + c. Attach fbcon:: + + vbetool vbestate restore < && \ + echo 1 > /sys/class/vtconsole/vtcon1/bind + +Variation 2: + + a. Before detaching fbcon, do:: + + echo > /sys/class/tty/console/bind + + vbetool vbemode get + + b. Take note of the mode number + + b. Detach fbcon as in step 5. + + c. Attach fbcon:: + + vbetool vbemode set && \ + echo 1 > /sys/class/vtconsole/vtcon1/bind + +Samples: +======== + +Here are 2 sample bash scripts that you can use to bind or unbind the +framebuffer console driver if you are on an X86 box:: + + #!/bin/bash + # Unbind fbcon + + # Change this to where your actual vgastate file is located + # Or Use VGASTATE=$1 to indicate the state file at runtime + VGASTATE=/tmp/vgastate + + # path to vbetool + VBETOOL=/usr/local/bin + + + for (( i = 0; i < 16; i++)) + do + if test -x /sys/class/vtconsole/vtcon$i; then + if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ + = 1 ]; then + if test -x $VBETOOL/vbetool; then + echo Unbinding vtcon$i + $VBETOOL/vbetool vbestate restore < $VGASTATE + echo 0 > /sys/class/vtconsole/vtcon$i/bind + fi + fi + fi + done + +--------------------------------------------------------------------------- + +:: + + #!/bin/bash + # Bind fbcon + + for (( i = 0; i < 16; i++)) + do + if test -x /sys/class/vtconsole/vtcon$i; then + if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ + = 1 ]; then + echo Unbinding vtcon$i + echo 1 > /sys/class/vtconsole/vtcon$i/bind + fi + fi + done + +Antonino Daplas diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt deleted file mode 100644 index 60a5ec04e8f0..000000000000 --- a/Documentation/fb/fbcon.txt +++ /dev/null @@ -1,347 +0,0 @@ -The Framebuffer Console -======================= - - The framebuffer console (fbcon), as its name implies, is a text -console running on top of the framebuffer device. It has the functionality of -any standard text console driver, such as the VGA console, with the added -features that can be attributed to the graphical nature of the framebuffer. - - In the x86 architecture, the framebuffer console is optional, and -some even treat it as a toy. For other architectures, it is the only available -display device, text or graphical. - - What are the features of fbcon? The framebuffer console supports -high resolutions, varying font types, display rotation, primitive multihead, -etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature -made available by the underlying graphics card are also possible. - -A. Configuration - - The framebuffer console can be enabled by using your favorite kernel -configuration tool. It is under Device Drivers->Graphics Support->Frame -buffer Devices->Console display driver support->Framebuffer Console Support. -Select 'y' to compile support statically or 'm' for module support. The -module will be fbcon. - - In order for fbcon to activate, at least one framebuffer driver is -required, so choose from any of the numerous drivers available. For x86 -systems, they almost universally have VGA cards, so vga16fb and vesafb will -always be available. However, using a chipset-specific driver will give you -more speed and features, such as the ability to change the video mode -dynamically. - - To display the penguin logo, choose any logo available in Graphics -support->Bootup logo. - - Also, you will need to select at least one compiled-in font, but if -you don't do anything, the kernel configuration tool will select one for you, -usually an 8x16 font. - -GOTCHA: A common bug report is enabling the framebuffer without enabling the -framebuffer console. Depending on the driver, you may get a blanked or -garbled display, but the system still boots to completion. If you are -fortunate to have a driver that does not alter the graphics chip, then you -will still get a VGA console. - -B. Loading - -Possible scenarios: - -1. Driver and fbcon are compiled statically - - Usually, fbcon will automatically take over your console. The notable - exception is vesafb. It needs to be explicitly activated with the - vga= boot option parameter. - -2. Driver is compiled statically, fbcon is compiled as a module - - Depending on the driver, you either get a standard console, or a - garbled display, as mentioned above. To get a framebuffer console, - do a 'modprobe fbcon'. - -3. Driver is compiled as a module, fbcon is compiled statically - - You get your standard console. Once the driver is loaded with - 'modprobe xxxfb', fbcon automatically takes over the console with - the possible exception of using the fbcon=map:n option. See below. - -4. Driver and fbcon are compiled as a module. - - You can load them in any order. Once both are loaded, fbcon will take - over the console. - -C. Boot options - - The framebuffer console has several, largely unknown, boot options - that can change its behavior. - -1. fbcon=font: - - Select the initial font to use. The value 'name' can be any of the - compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, - PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, VGA8x16, VGA8x8. - - Note, not all drivers can handle font with widths not divisible by 8, - such as vga16fb. - -2. fbcon=scrollback:[k] - - The scrollback buffer is memory that is used to preserve display - contents that has already scrolled past your view. This is accessed - by using the Shift-PageUp key combination. The value 'value' is any - integer. It defaults to 32KB. The 'k' suffix is optional, and will - multiply the 'value' by 1024. - -3. fbcon=map:<0123> - - This is an interesting option. It tells which driver gets mapped to - which console. The value '0123' is a sequence that gets repeated until - the total length is 64 which is the number of consoles available. In - the above example, it is expanded to 012301230123... and the mapping - will be: - - tty | 1 2 3 4 5 6 7 8 9 ... - fb | 0 1 2 3 0 1 2 3 0 ... - - ('cat /proc/fb' should tell you what the fb numbers are) - - One side effect that may be useful is using a map value that exceeds - the number of loaded fb drivers. For example, if only one driver is - available, fb0, adding fbcon=map:1 tells fbcon not to take over the - console. - - Later on, when you want to map the console the to the framebuffer - device, you can use the con2fbmap utility. - -4. fbcon=vc:- - - This option tells fbcon to take over only a range of consoles as - specified by the values 'n1' and 'n2'. The rest of the consoles - outside the given range will still be controlled by the standard - console driver. - - NOTE: For x86 machines, the standard console is the VGA console which - is typically located on the same video card. Thus, the consoles that - are controlled by the VGA console will be garbled. - -4. fbcon=rotate: - - This option changes the orientation angle of the console display. The - value 'n' accepts the following: - - 0 - normal orientation (0 degree) - 1 - clockwise orientation (90 degrees) - 2 - upside down orientation (180 degrees) - 3 - counterclockwise orientation (270 degrees) - - The angle can be changed anytime afterwards by 'echoing' the same - numbers to any one of the 2 attributes found in - /sys/class/graphics/fbcon: - - rotate - rotate the display of the active console - rotate_all - rotate the display of all consoles - - Console rotation will only become available if Framebuffer Console - Rotation support is compiled in your kernel. - - NOTE: This is purely console rotation. Any other applications that - use the framebuffer will remain at their 'normal' orientation. - Actually, the underlying fb driver is totally ignorant of console - rotation. - -5. fbcon=margin: - - This option specifies the color of the margins. The margins are the - leftover area at the right and the bottom of the screen that are not - used by text. By default, this area will be black. The 'color' value - is an integer number that depends on the framebuffer driver being used. - -6. fbcon=nodefer - - If the kernel is compiled with deferred fbcon takeover support, normally - the framebuffer contents, left in place by the firmware/bootloader, will - be preserved until there actually is some text is output to the console. - This option causes fbcon to bind immediately to the fbdev device. - -7. fbcon=logo-pos: - - The only possible 'location' is 'center' (without quotes), and when - given, the bootup logo is moved from the default top-left corner - location to the center of the framebuffer. If more than one logo is - displayed due to multiple CPUs, the collected line of logos is moved - as a whole. - -C. Attaching, Detaching and Unloading - -Before going on to how to attach, detach and unload the framebuffer console, an -illustration of the dependencies may help. - -The console layer, as with most subsystems, needs a driver that interfaces with -the hardware. Thus, in a VGA console: - -console ---> VGA driver ---> hardware. - -Assuming the VGA driver can be unloaded, one must first unbind the VGA driver -from the console layer before unloading the driver. The VGA driver cannot be -unloaded if it is still bound to the console layer. (See -Documentation/console/console.txt for more information). - -This is more complicated in the case of the framebuffer console (fbcon), -because fbcon is an intermediate layer between the console and the drivers: - -console ---> fbcon ---> fbdev drivers ---> hardware - -The fbdev drivers cannot be unloaded if bound to fbcon, and fbcon cannot -be unloaded if it's bound to the console layer. - -So to unload the fbdev drivers, one must first unbind fbcon from the console, -then unbind the fbdev drivers from fbcon. Fortunately, unbinding fbcon from -the console layer will automatically unbind framebuffer drivers from -fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from -fbcon. - -So, how do we unbind fbcon from the console? Part of the answer is in -Documentation/console/console.txt. To summarize: - -Echo a value to the bind file that represents the framebuffer console -driver. So assuming vtcon1 represents fbcon, then: - -echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to - console layer -echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from - console layer - -If fbcon is detached from the console layer, your boot console driver (which is -usually VGA text mode) will take over. A few drivers (rivafb and i810fb) will -restore VGA text mode for you. With the rest, before detaching fbcon, you -must take a few additional steps to make sure that your VGA text mode is -restored properly. The following is one of the several methods that you can do: - -1. Download or install vbetool. This utility is included with most - distributions nowadays, and is usually part of the suspend/resume tool. - -2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set - to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers. - -3. Boot into text mode and as root run: - - vbetool vbestate save > - - The above command saves the register contents of your graphics - hardware to . You need to do this step only once as - the state file can be reused. - -4. If fbcon is compiled as a module, load fbcon by doing: - - modprobe fbcon - -5. Now to detach fbcon: - - vbetool vbestate restore < && \ - echo 0 > /sys/class/vtconsole/vtcon1/bind - -6. That's it, you're back to VGA mode. And if you compiled fbcon as a module, - you can unload it by 'rmmod fbcon'. - -7. To reattach fbcon: - - echo 1 > /sys/class/vtconsole/vtcon1/bind - -8. Once fbcon is unbound, all drivers registered to the system will also -become unbound. This means that fbcon and individual framebuffer drivers -can be unloaded or reloaded at will. Reloading the drivers or fbcon will -automatically bind the console, fbcon and the drivers together. Unloading -all the drivers without unloading fbcon will make it impossible for the -console to bind fbcon. - -Notes for vesafb users: -======================= - -Unfortunately, if your bootline includes a vga=xxx parameter that sets the -hardware in graphics mode, such as when loading vesafb, vgacon will not load. -Instead, vgacon will replace the default boot console with dummycon, and you -won't get any display after detaching fbcon. Your machine is still alive, so -you can reattach vesafb. However, to reattach vesafb, you need to do one of -the following: - -Variation 1: - - a. Before detaching fbcon, do - - vbetool vbemode save > # do once for each vesafb mode, - # the file can be reused - - b. Detach fbcon as in step 5. - - c. Attach fbcon - - vbetool vbestate restore < && \ - echo 1 > /sys/class/vtconsole/vtcon1/bind - -Variation 2: - - a. Before detaching fbcon, do: - echo > /sys/class/tty/console/bind - - - vbetool vbemode get - - b. Take note of the mode number - - b. Detach fbcon as in step 5. - - c. Attach fbcon: - - vbetool vbemode set && \ - echo 1 > /sys/class/vtconsole/vtcon1/bind - -Samples: -======== - -Here are 2 sample bash scripts that you can use to bind or unbind the -framebuffer console driver if you are on an X86 box: - ---------------------------------------------------------------------------- -#!/bin/bash -# Unbind fbcon - -# Change this to where your actual vgastate file is located -# Or Use VGASTATE=$1 to indicate the state file at runtime -VGASTATE=/tmp/vgastate - -# path to vbetool -VBETOOL=/usr/local/bin - - -for (( i = 0; i < 16; i++)) -do - if test -x /sys/class/vtconsole/vtcon$i; then - if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ - = 1 ]; then - if test -x $VBETOOL/vbetool; then - echo Unbinding vtcon$i - $VBETOOL/vbetool vbestate restore < $VGASTATE - echo 0 > /sys/class/vtconsole/vtcon$i/bind - fi - fi - fi -done - ---------------------------------------------------------------------------- -#!/bin/bash -# Bind fbcon - -for (( i = 0; i < 16; i++)) -do - if test -x /sys/class/vtconsole/vtcon$i; then - if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ - = 1 ]; then - echo Unbinding vtcon$i - echo 1 > /sys/class/vtconsole/vtcon$i/bind - fi - fi -done ---------------------------------------------------------------------------- - --- -Antonino Daplas diff --git a/Documentation/fb/framebuffer.rst b/Documentation/fb/framebuffer.rst new file mode 100644 index 000000000000..7fe087310c82 --- /dev/null +++ b/Documentation/fb/framebuffer.rst @@ -0,0 +1,353 @@ +======================= +The Frame Buffer Device +======================= + +Last revised: May 10, 2001 + + +0. Introduction +--------------- + +The frame buffer device provides an abstraction for the graphics hardware. It +represents the frame buffer of some video hardware and allows application +software to access the graphics hardware through a well-defined interface, so +the software doesn't need to know anything about the low-level (hardware +register) stuff. + +The device is accessed through special device nodes, usually located in the +/dev directory, i.e. /dev/fb*. + + +1. User's View of /dev/fb* +-------------------------- + +From the user's point of view, the frame buffer device looks just like any +other device in /dev. It's a character device using major 29; the minor +specifies the frame buffer number. + +By convention, the following device nodes are used (numbers indicate the device +minor numbers):: + + 0 = /dev/fb0 First frame buffer + 1 = /dev/fb1 Second frame buffer + ... + 31 = /dev/fb31 32nd frame buffer + +For backwards compatibility, you may want to create the following symbolic +links:: + + /dev/fb0current -> fb0 + /dev/fb1current -> fb1 + +and so on... + +The frame buffer devices are also `normal` memory devices, this means, you can +read and write their contents. You can, for example, make a screen snapshot by:: + + cp /dev/fb0 myfile + +There also can be more than one frame buffer at a time, e.g. if you have a +graphics card in addition to the built-in hardware. The corresponding frame +buffer devices (/dev/fb0 and /dev/fb1 etc.) work independently. + +Application software that uses the frame buffer device (e.g. the X server) will +use /dev/fb0 by default (older software uses /dev/fb0current). You can specify +an alternative frame buffer device by setting the environment variable +$FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash +users):: + + export FRAMEBUFFER=/dev/fb1 + +or (for csh users):: + + setenv FRAMEBUFFER /dev/fb1 + +After this the X server will use the second frame buffer. + + +2. Programmer's View of /dev/fb* +-------------------------------- + +As you already know, a frame buffer device is a memory device like /dev/mem and +it has the same features. You can read it, write it, seek to some location in +it and mmap() it (the main usage). The difference is just that the memory that +appears in the special file is not the whole memory, but the frame buffer of +some video hardware. + +/dev/fb* also allows several ioctls on it, by which lots of information about +the hardware can be queried and set. The color map handling works via ioctls, +too. Look into for more information on what ioctls exist and on +which data structures they work. Here's just a brief overview: + + - You can request unchangeable information about the hardware, like name, + organization of the screen memory (planes, packed pixels, ...) and address + and length of the screen memory. + + - You can request and change variable information about the hardware, like + visible and virtual geometry, depth, color map format, timing, and so on. + If you try to change that information, the driver maybe will round up some + values to meet the hardware's capabilities (or return EINVAL if that isn't + possible). + + - You can get and set parts of the color map. Communication is done with 16 + bits per color part (red, green, blue, transparency) to support all + existing hardware. The driver does all the computations needed to apply + it to the hardware (round it down to less bits, maybe throw away + transparency). + +All this hardware abstraction makes the implementation of application programs +easier and more portable. E.g. the X server works completely on /dev/fb* and +thus doesn't need to know, for example, how the color registers of the concrete +hardware are organized. XF68_FBDev is a general X server for bitmapped, +unaccelerated video hardware. The only thing that has to be built into +application programs is the screen organization (bitplanes or chunky pixels +etc.), because it works on the frame buffer image data directly. + +For the future it is planned that frame buffer drivers for graphics cards and +the like can be implemented as kernel modules that are loaded at runtime. Such +a driver just has to call register_framebuffer() and supply some functions. +Writing and distributing such drivers independently from the kernel will save +much trouble... + + +3. Frame Buffer Resolution Maintenance +-------------------------------------- + +Frame buffer resolutions are maintained using the utility `fbset`. It can +change the video mode properties of a frame buffer device. Its main usage is +to change the current video mode, e.g. during boot up in one of your `/etc/rc.*` +or `/etc/init.d/*` files. + +Fbset uses a video mode database stored in a configuration file, so you can +easily add your own modes and refer to them with a simple identifier. + + +4. The X Server +--------------- + +The X server (XF68_FBDev) is the most notable application program for the frame +buffer device. Starting with XFree86 release 3.2, the X server is part of +XFree86 and has 2 modes: + + - If the `Display` subsection for the `fbdev` driver in the /etc/XF86Config + file contains a:: + + Modes "default" + + line, the X server will use the scheme discussed above, i.e. it will start + up in the resolution determined by /dev/fb0 (or $FRAMEBUFFER, if set). You + still have to specify the color depth (using the Depth keyword) and virtual + resolution (using the Virtual keyword) though. This is the default for the + configuration file supplied with XFree86. It's the most simple + configuration, but it has some limitations. + + - Therefore it's also possible to specify resolutions in the /etc/XF86Config + file. This allows for on-the-fly resolution switching while retaining the + same virtual desktop size. The frame buffer device that's used is still + /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are + defined by /etc/XF86Config now. The disadvantage is that you have to + specify the timings in a different format (but `fbset -x` may help). + +To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't +work 100% with XF68_FBDev: the reported clock values are always incorrect. + + +5. Video Mode Timings +--------------------- + +A monitor draws an image on the screen by using an electron beam (3 electron +beams for color models, 1 electron beam for monochrome monitors). The front of +the screen is covered by a pattern of colored phosphors (pixels). If a phosphor +is hit by an electron, it emits a photon and thus becomes visible. + +The electron beam draws horizontal lines (scanlines) from left to right, and +from the top to the bottom of the screen. By modifying the intensity of the +electron beam, pixels with various colors and intensities can be shown. + +After each scanline the electron beam has to move back to the left side of the +screen and to the next line: this is called the horizontal retrace. After the +whole screen (frame) was painted, the beam moves back to the upper left corner: +this is called the vertical retrace. During both the horizontal and vertical +retrace, the electron beam is turned off (blanked). + +The speed at which the electron beam paints the pixels is determined by the +dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions +of cycles per second), each pixel is 35242 ps (picoseconds) long:: + + 1/(28.37516E6 Hz) = 35.242E-9 s + +If the screen resolution is 640x480, it will take:: + + 640*35.242E-9 s = 22.555E-6 s + +to paint the 640 (xres) pixels on one scanline. But the horizontal retrace +also takes time (e.g. 272 `pixels`), so a full scanline takes:: + + (640+272)*35.242E-9 s = 32.141E-6 s + +We'll say that the horizontal scanrate is about 31 kHz:: + + 1/(32.141E-6 s) = 31.113E3 Hz + +A full screen counts 480 (yres) lines, but we have to consider the vertical +retrace too (e.g. 49 `lines`). So a full screen will take:: + + (480+49)*32.141E-6 s = 17.002E-3 s + +The vertical scanrate is about 59 Hz:: + + 1/(17.002E-3 s) = 58.815 Hz + +This means the screen data is refreshed about 59 times per second. To have a +stable picture without visible flicker, VESA recommends a vertical scanrate of +at least 72 Hz. But the perceived flicker is very human dependent: some people +can use 50 Hz without any trouble, while I'll notice if it's less than 80 Hz. + +Since the monitor doesn't know when a new scanline starts, the graphics board +will supply a synchronization pulse (horizontal sync or hsync) for each +scanline. Similarly it supplies a synchronization pulse (vertical sync or +vsync) for each new frame. The position of the image on the screen is +influenced by the moments at which the synchronization pulses occur. + +The following picture summarizes all timings. The horizontal retrace time is +the sum of the left margin, the right margin and the hsync length, while the +vertical retrace time is the sum of the upper margin, the lower margin and the +vsync length:: + + +----------+---------------------------------------------+----------+-------+ + | | ↑ | | | + | | |upper_margin | | | + | | ↓ | | | + +----------###############################################----------+-------+ + | # ↑ # | | + | # | # | | + | # | # | | + | # | # | | + | left # | # right | hsync | + | margin # | xres # margin | len | + |<-------->#<---------------+--------------------------->#<-------->|<----->| + | # | # | | + | # | # | | + | # | # | | + | # |yres # | | + | # | # | | + | # | # | | + | # | # | | + | # | # | | + | # | # | | + | # | # | | + | # | # | | + | # | # | | + | # ↓ # | | + +----------###############################################----------+-------+ + | | ↑ | | | + | | |lower_margin | | | + | | ↓ | | | + +----------+---------------------------------------------+----------+-------+ + | | ↑ | | | + | | |vsync_len | | | + | | ↓ | | | + +----------+---------------------------------------------+----------+-------+ + +The frame buffer device expects all horizontal timings in number of dotclocks +(in picoseconds, 1E-12 s), and vertical timings in number of scanlines. + + +6. Converting XFree86 timing values info frame buffer device timings +-------------------------------------------------------------------- + +An XFree86 mode line consists of the following fields:: + + "800x600" 50 800 856 976 1040 600 637 643 666 + < name > DCF HR SH1 SH2 HFL VR SV1 SV2 VFL + +The frame buffer device uses the following fields: + + - pixclock: pixel clock in ps (pico seconds) + - left_margin: time from sync to picture + - right_margin: time from picture to sync + - upper_margin: time from sync to picture + - lower_margin: time from picture to sync + - hsync_len: length of horizontal sync + - vsync_len: length of vertical sync + +1) Pixelclock: + + xfree: in MHz + + fb: in picoseconds (ps) + + pixclock = 1000000 / DCF + +2) horizontal timings: + + left_margin = HFL - SH2 + + right_margin = SH1 - HR + + hsync_len = SH2 - SH1 + +3) vertical timings: + + upper_margin = VFL - SV2 + + lower_margin = SV1 - VR + + vsync_len = SV2 - SV1 + +Good examples for VESA timings can be found in the XFree86 source tree, +under "xc/programs/Xserver/hw/xfree86/doc/modeDB.txt". + + +7. References +------------- + +For more specific information about the frame buffer device and its +applications, please refer to the Linux-fbdev website: + + http://linux-fbdev.sourceforge.net/ + +and to the following documentation: + + - The manual pages for fbset: fbset(8), fb.modes(5) + - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5) + - The mighty kernel sources: + + - linux/drivers/video/ + - linux/include/linux/fb.h + - linux/include/video/ + + + +8. Mailing list +--------------- + +There is a frame buffer device related mailing list at kernel.org: +linux-fbdev@vger.kernel.org. + +Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for +subscription information and archive browsing. + + +9. Downloading +-------------- + +All necessary files can be found at + + ftp://ftp.uni-erlangen.de/pub/Linux/LOCAL/680x0/ + +and on its mirrors. + +The latest version of fbset can be found at + + http://www.linux-fbdev.org/ + + +10. Credits +----------- + +This readme was written by Geert Uytterhoeven, partly based on the original +`X-framebuffer.README` by Roman Hodek and Martin Schaller. Section 6 was +provided by Frank Neumann. + +The frame buffer device abstraction was designed by Martin Schaller. diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.txt deleted file mode 100644 index 58c5ae2e9f59..000000000000 --- a/Documentation/fb/framebuffer.txt +++ /dev/null @@ -1,343 +0,0 @@ - The Frame Buffer Device - ----------------------- - -Maintained by Geert Uytterhoeven -Last revised: May 10, 2001 - - -0. Introduction ---------------- - -The frame buffer device provides an abstraction for the graphics hardware. It -represents the frame buffer of some video hardware and allows application -software to access the graphics hardware through a well-defined interface, so -the software doesn't need to know anything about the low-level (hardware -register) stuff. - -The device is accessed through special device nodes, usually located in the -/dev directory, i.e. /dev/fb*. - - -1. User's View of /dev/fb* --------------------------- - -From the user's point of view, the frame buffer device looks just like any -other device in /dev. It's a character device using major 29; the minor -specifies the frame buffer number. - -By convention, the following device nodes are used (numbers indicate the device -minor numbers): - - 0 = /dev/fb0 First frame buffer - 1 = /dev/fb1 Second frame buffer - ... - 31 = /dev/fb31 32nd frame buffer - -For backwards compatibility, you may want to create the following symbolic -links: - - /dev/fb0current -> fb0 - /dev/fb1current -> fb1 - -and so on... - -The frame buffer devices are also `normal' memory devices, this means, you can -read and write their contents. You can, for example, make a screen snapshot by - - cp /dev/fb0 myfile - -There also can be more than one frame buffer at a time, e.g. if you have a -graphics card in addition to the built-in hardware. The corresponding frame -buffer devices (/dev/fb0 and /dev/fb1 etc.) work independently. - -Application software that uses the frame buffer device (e.g. the X server) will -use /dev/fb0 by default (older software uses /dev/fb0current). You can specify -an alternative frame buffer device by setting the environment variable -$FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash -users): - - export FRAMEBUFFER=/dev/fb1 - -or (for csh users): - - setenv FRAMEBUFFER /dev/fb1 - -After this the X server will use the second frame buffer. - - -2. Programmer's View of /dev/fb* --------------------------------- - -As you already know, a frame buffer device is a memory device like /dev/mem and -it has the same features. You can read it, write it, seek to some location in -it and mmap() it (the main usage). The difference is just that the memory that -appears in the special file is not the whole memory, but the frame buffer of -some video hardware. - -/dev/fb* also allows several ioctls on it, by which lots of information about -the hardware can be queried and set. The color map handling works via ioctls, -too. Look into for more information on what ioctls exist and on -which data structures they work. Here's just a brief overview: - - - You can request unchangeable information about the hardware, like name, - organization of the screen memory (planes, packed pixels, ...) and address - and length of the screen memory. - - - You can request and change variable information about the hardware, like - visible and virtual geometry, depth, color map format, timing, and so on. - If you try to change that information, the driver maybe will round up some - values to meet the hardware's capabilities (or return EINVAL if that isn't - possible). - - - You can get and set parts of the color map. Communication is done with 16 - bits per color part (red, green, blue, transparency) to support all - existing hardware. The driver does all the computations needed to apply - it to the hardware (round it down to less bits, maybe throw away - transparency). - -All this hardware abstraction makes the implementation of application programs -easier and more portable. E.g. the X server works completely on /dev/fb* and -thus doesn't need to know, for example, how the color registers of the concrete -hardware are organized. XF68_FBDev is a general X server for bitmapped, -unaccelerated video hardware. The only thing that has to be built into -application programs is the screen organization (bitplanes or chunky pixels -etc.), because it works on the frame buffer image data directly. - -For the future it is planned that frame buffer drivers for graphics cards and -the like can be implemented as kernel modules that are loaded at runtime. Such -a driver just has to call register_framebuffer() and supply some functions. -Writing and distributing such drivers independently from the kernel will save -much trouble... - - -3. Frame Buffer Resolution Maintenance --------------------------------------- - -Frame buffer resolutions are maintained using the utility `fbset'. It can -change the video mode properties of a frame buffer device. Its main usage is -to change the current video mode, e.g. during boot up in one of your /etc/rc.* -or /etc/init.d/* files. - -Fbset uses a video mode database stored in a configuration file, so you can -easily add your own modes and refer to them with a simple identifier. - - -4. The X Server ---------------- - -The X server (XF68_FBDev) is the most notable application program for the frame -buffer device. Starting with XFree86 release 3.2, the X server is part of -XFree86 and has 2 modes: - - - If the `Display' subsection for the `fbdev' driver in the /etc/XF86Config - file contains a - - Modes "default" - - line, the X server will use the scheme discussed above, i.e. it will start - up in the resolution determined by /dev/fb0 (or $FRAMEBUFFER, if set). You - still have to specify the color depth (using the Depth keyword) and virtual - resolution (using the Virtual keyword) though. This is the default for the - configuration file supplied with XFree86. It's the most simple - configuration, but it has some limitations. - - - Therefore it's also possible to specify resolutions in the /etc/XF86Config - file. This allows for on-the-fly resolution switching while retaining the - same virtual desktop size. The frame buffer device that's used is still - /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are - defined by /etc/XF86Config now. The disadvantage is that you have to - specify the timings in a different format (but `fbset -x' may help). - -To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't -work 100% with XF68_FBDev: the reported clock values are always incorrect. - - -5. Video Mode Timings ---------------------- - -A monitor draws an image on the screen by using an electron beam (3 electron -beams for color models, 1 electron beam for monochrome monitors). The front of -the screen is covered by a pattern of colored phosphors (pixels). If a phosphor -is hit by an electron, it emits a photon and thus becomes visible. - -The electron beam draws horizontal lines (scanlines) from left to right, and -from the top to the bottom of the screen. By modifying the intensity of the -electron beam, pixels with various colors and intensities can be shown. - -After each scanline the electron beam has to move back to the left side of the -screen and to the next line: this is called the horizontal retrace. After the -whole screen (frame) was painted, the beam moves back to the upper left corner: -this is called the vertical retrace. During both the horizontal and vertical -retrace, the electron beam is turned off (blanked). - -The speed at which the electron beam paints the pixels is determined by the -dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions -of cycles per second), each pixel is 35242 ps (picoseconds) long: - - 1/(28.37516E6 Hz) = 35.242E-9 s - -If the screen resolution is 640x480, it will take - - 640*35.242E-9 s = 22.555E-6 s - -to paint the 640 (xres) pixels on one scanline. But the horizontal retrace -also takes time (e.g. 272 `pixels'), so a full scanline takes - - (640+272)*35.242E-9 s = 32.141E-6 s - -We'll say that the horizontal scanrate is about 31 kHz: - - 1/(32.141E-6 s) = 31.113E3 Hz - -A full screen counts 480 (yres) lines, but we have to consider the vertical -retrace too (e.g. 49 `lines'). So a full screen will take - - (480+49)*32.141E-6 s = 17.002E-3 s - -The vertical scanrate is about 59 Hz: - - 1/(17.002E-3 s) = 58.815 Hz - -This means the screen data is refreshed about 59 times per second. To have a -stable picture without visible flicker, VESA recommends a vertical scanrate of -at least 72 Hz. But the perceived flicker is very human dependent: some people -can use 50 Hz without any trouble, while I'll notice if it's less than 80 Hz. - -Since the monitor doesn't know when a new scanline starts, the graphics board -will supply a synchronization pulse (horizontal sync or hsync) for each -scanline. Similarly it supplies a synchronization pulse (vertical sync or -vsync) for each new frame. The position of the image on the screen is -influenced by the moments at which the synchronization pulses occur. - -The following picture summarizes all timings. The horizontal retrace time is -the sum of the left margin, the right margin and the hsync length, while the -vertical retrace time is the sum of the upper margin, the lower margin and the -vsync length. - - +----------+---------------------------------------------+----------+-------+ - | | ↑ | | | - | | |upper_margin | | | - | | ↓ | | | - +----------###############################################----------+-------+ - | # ↑ # | | - | # | # | | - | # | # | | - | # | # | | - | left # | # right | hsync | - | margin # | xres # margin | len | - |<-------->#<---------------+--------------------------->#<-------->|<----->| - | # | # | | - | # | # | | - | # | # | | - | # |yres # | | - | # | # | | - | # | # | | - | # | # | | - | # | # | | - | # | # | | - | # | # | | - | # | # | | - | # | # | | - | # ↓ # | | - +----------###############################################----------+-------+ - | | ↑ | | | - | | |lower_margin | | | - | | ↓ | | | - +----------+---------------------------------------------+----------+-------+ - | | ↑ | | | - | | |vsync_len | | | - | | ↓ | | | - +----------+---------------------------------------------+----------+-------+ - -The frame buffer device expects all horizontal timings in number of dotclocks -(in picoseconds, 1E-12 s), and vertical timings in number of scanlines. - - -6. Converting XFree86 timing values info frame buffer device timings --------------------------------------------------------------------- - -An XFree86 mode line consists of the following fields: - "800x600" 50 800 856 976 1040 600 637 643 666 - < name > DCF HR SH1 SH2 HFL VR SV1 SV2 VFL - -The frame buffer device uses the following fields: - - - pixclock: pixel clock in ps (pico seconds) - - left_margin: time from sync to picture - - right_margin: time from picture to sync - - upper_margin: time from sync to picture - - lower_margin: time from picture to sync - - hsync_len: length of horizontal sync - - vsync_len: length of vertical sync - -1) Pixelclock: - xfree: in MHz - fb: in picoseconds (ps) - - pixclock = 1000000 / DCF - -2) horizontal timings: - left_margin = HFL - SH2 - right_margin = SH1 - HR - hsync_len = SH2 - SH1 - -3) vertical timings: - upper_margin = VFL - SV2 - lower_margin = SV1 - VR - vsync_len = SV2 - SV1 - -Good examples for VESA timings can be found in the XFree86 source tree, -under "xc/programs/Xserver/hw/xfree86/doc/modeDB.txt". - - -7. References -------------- - -For more specific information about the frame buffer device and its -applications, please refer to the Linux-fbdev website: - - http://linux-fbdev.sourceforge.net/ - -and to the following documentation: - - - The manual pages for fbset: fbset(8), fb.modes(5) - - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5) - - The mighty kernel sources: - o linux/drivers/video/ - o linux/include/linux/fb.h - o linux/include/video/ - - - -8. Mailing list ---------------- - -There is a frame buffer device related mailing list at kernel.org: -linux-fbdev@vger.kernel.org. - -Point your web browser to http://sourceforge.net/projects/linux-fbdev/ for -subscription information and archive browsing. - - -9. Downloading --------------- - -All necessary files can be found at - - ftp://ftp.uni-erlangen.de/pub/Linux/LOCAL/680x0/ - -and on its mirrors. - -The latest version of fbset can be found at - - http://www.linux-fbdev.org/ - - -10. Credits ----------- - -This readme was written by Geert Uytterhoeven, partly based on the original -`X-framebuffer.README' by Roman Hodek and Martin Schaller. Section 6 was -provided by Frank Neumann. - -The frame buffer device abstraction was designed by Martin Schaller. diff --git a/Documentation/fb/gxfb.rst b/Documentation/fb/gxfb.rst new file mode 100644 index 000000000000..5738709bccbb --- /dev/null +++ b/Documentation/fb/gxfb.rst @@ -0,0 +1,54 @@ +============= +What is gxfb? +============= + +.. [This file is cloned from VesaFB/aty128fb] + +This is a graphics framebuffer driver for AMD Geode GX2 based processors. + +Advantages: + + * No need to use AMD's VSA code (or other VESA emulation layer) in the + BIOS. + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF68_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... + + +How to use it? +============== + +Switching modes is done using gxfb.mode_option=... boot +parameter or using `fbset` program. + +See Documentation/fb/modedb.rst for more information on modedb +resolutions. + + +X11 +=== + +XF68_FBDev should generally work fine, but it is non-accelerated. + + +Configuration +============= + +You can pass kernel command line options to gxfb with gxfb.
[,
+][;,,,,
[,
+]+] + +Where:: + + ::= The device name. + ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" + ::= The device minor number | "" + ::= "ro" | "rw" +
::= + ::= "verity" | "linear" | ... (see list below) + +The dm line should be equivalent to the one used by the dmsetup tool with the +`--concise` argument. + +Target types +============ + +Not all target types are available as there are serious risks in allowing +activation of certain DM targets without first using userspace tools to check +the validity of associated metadata. + +======================= ======================================================= +`cache` constrained, userspace should verify cache device +`crypt` allowed +`delay` allowed +`era` constrained, userspace should verify metadata device +`flakey` constrained, meant for test +`linear` allowed +`log-writes` constrained, userspace should verify metadata device +`mirror` constrained, userspace should verify main/mirror device +`raid` constrained, userspace should verify metadata device +`snapshot` constrained, userspace should verify src/dst device +`snapshot-origin` allowed +`snapshot-merge` constrained, userspace should verify src/dst device +`striped` allowed +`switch` constrained, userspace should verify dev path +`thin` constrained, requires dm target message from userspace +`thin-pool` constrained, requires dm target message from userspace +`verity` allowed +`writecache` constrained, userspace should verify cache device +`zero` constrained, not meant for rootfs +======================= ======================================================= + +If the target is not listed above, it is constrained by default (not tested). + +Examples +======== +An example of booting to a linear array made up of user-mode linux block +devices:: + + dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 + +This will boot to a rw dm-linear target of 8192 sectors split across two block +devices identified by their major:minor numbers. After boot, udev will rename +this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. + +An example of multiple device-mappers, with the dm-mod.create="..." contents +is shown here split on multiple lines for readability:: + + dm-linear,,1,rw, + 0 32768 linear 8:1 0, + 32768 1024000 linear 8:2 0; + dm-verity,,3,ro, + 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 + ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 + 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 + +Other examples (per target): + +"crypt":: + + dm-crypt,,8,ro, + 0 1048576 crypt aes-xts-plain64 + babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 + /dev/sda 0 1 allow_discards + +"delay":: + + dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 + +"linear":: + + dm-linear,,,rw, + 0 32768 linear /dev/sda1 0, + 32768 1024000 linear /dev/sda2 0, + 1056768 204800 linear /dev/sda3 0, + 1261568 512000 linear /dev/sda4 0 + +"snapshot-origin":: + + dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 + +"striped":: + + dm-striped,,4,ro,0 1638400 striped 4 4096 + /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 + +"verity":: + + dm-verity,,4,ro, + 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 + fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd + 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst new file mode 100644 index 000000000000..a30aa91b5fbe --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -0,0 +1,259 @@ +============ +dm-integrity +============ + +The dm-integrity target emulates a block device that has additional +per-sector tags that can be used for storing integrity information. + +A general problem with storing integrity tags with every sector is that +writing the sector and the integrity tag must be atomic - i.e. in case of +crash, either both sector and integrity tag or none of them is written. + +To guarantee write atomicity, the dm-integrity target uses journal, it +writes sector data and integrity tags into a journal, commits the journal +and then copies the data and integrity tags to their respective location. + +The dm-integrity target can be used with the dm-crypt target - in this +situation the dm-crypt target creates the integrity data and passes them +to the dm-integrity target via bio_integrity_payload attached to the bio. +In this mode, the dm-crypt and dm-integrity targets provide authenticated +disk encryption - if the attacker modifies the encrypted device, an I/O +error is returned instead of random data. + +The dm-integrity target can also be used as a standalone target, in this +mode it calculates and verifies the integrity tag internally. In this +mode, the dm-integrity target can be used to detect silent data +corruption on the disk or in the I/O path. + +There's an alternate mode of operation where dm-integrity uses bitmap +instead of a journal. If a bit in the bitmap is 1, the corresponding +region's data and integrity tags are not synchronized - if the machine +crashes, the unsynchronized regions will be recalculated. The bitmap mode +is faster than the journal mode, because we don't have to write the data +twice, but it is also less reliable, because if data corruption happens +when the machine crashes, it may not be detected. + +When loading the target for the first time, the kernel driver will format +the device. But it will only format the device if the superblock contains +zeroes. If the superblock is neither valid nor zeroed, the dm-integrity +target can't be loaded. + +To use the target for the first time: + +1. overwrite the superblock with zeroes +2. load the dm-integrity target with one-sector size, the kernel driver + will format the device +3. unload the dm-integrity target +4. read the "provided_data_sectors" value from the superblock +5. load the dm-integrity target with the the target size + "provided_data_sectors" +6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target + with the size "provided_data_sectors" + + +Target arguments: + +1. the underlying block device + +2. the number of reserved sector at the beginning of the device - the + dm-integrity won't read of write these sectors + +3. the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + + D - direct writes (without journal) + in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag doesn't match. + J - journaled writes + data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode degrades write throughput twice because the + data have to be written twice. + B - bitmap mode - data and metadata are written without any + synchronization, the driver maintains a bitmap of dirty + regions where data and metadata don't match. This mode can + only be used with internal hash. + R - recovery mode - in this mode, journal is not replayed, + checksums are not checked and writes to the device are not + allowed. This mode is useful for data recovery if the + device cannot be activated in any of the other standard + modes. + +5. the number of additional arguments + +Additional arguments: + +journal_sectors:number + The size of journal, this argument is used only if formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave_sectors:number + The number of interleaved sectors. This values is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +meta_device:device + Don't interleave the data and metadata on on device. Use a + separate device for metadata. + +buffer_sectors:number + The number of sectors in one buffer. The value is rounded down to + a power of two. + + The tag area is accessed using buffers, the buffer size is + configurable. The large buffer size means that the I/O size will + be larger, but there could be less I/Os issued. + +journal_watermark:number + The journal watermark in percents. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit_time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediatelly if the FLUSH + request is received. + +internal_hash:algorithm(:key) (the key is optional) + Use internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32), then integrity target + will protect the data against accidental corruption. + You can also use a hmac algorithm (for example + "hmac(sha256):0123456789abcdef"), in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +recalculate + Recalculate the integrity tags automatically. It is only valid + when using internal hash. + +journal_crypt:algorithm(:key) (the key is optional) + Encrypt the journal using given algorithm to make sure that the + attacker can't read the journal. You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains history of last writes to the block device, + an attacker reading the journal could see the last sector nubmers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. To protect against this + situation, you can encrypt the journal. + +journal_mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification. To protect against accidental modification, use a + crc algorithm, to protect against malicious modification, use a + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, modified sector number would be detected at + this stage. + +block_size:number + The size of a data block in bytes. The larger the block size the + less overhead there is for per-block integrity metadata. + Supported values are 512, 1024, 2048 and 4096 bytes. If not + specified the default block size is 512 bytes. + +sectors_per_bit:number + In the bitmap mode, this parameter specifies the number of + 512-byte sectors that corresponds to one bitmap bit. + +bitmap_flush_interval:number + The bitmap flush interval in milliseconds. The metadata buffers + are synchronized when this interval expires. + + +The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of disk data depend on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: + +* reserved sectors + (they are not used by this target, they can be used for + storing LUKS metadata or for other purpose), the size of the reserved + area is specified in the target arguments + +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags + SB_FLAG_HAVE_JOURNAL_MAC + - a flag is set if journal_mac is used + SB_FLAG_RECALCULATING + - recalculating is in progress + SB_FLAG_DIRTY_BITMAP + - journal area contains the bitmap of dirty + blocks + * log2(sectors per block) + * a position where recalculating finished +* journal + The journal is divided into sections, each section contains: + + * metadata area (4kiB), it contains journal entries + + - every journal entry contains: + + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + + - every metadata sector ends with + + * mac (8-bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store hmac of sector + numbers in the journal section, to protect against a + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + + - every sector in the data area contains: + + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. + +* one or more runs of interleaved tags and data. + Each run contains: + + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. log2 of this value is stored + in the superblock. diff --git a/Documentation/admin-guide/device-mapper/dm-io.rst b/Documentation/admin-guide/device-mapper/dm-io.rst new file mode 100644 index 000000000000..d2492917a1f5 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-io.rst @@ -0,0 +1,75 @@ +===== +dm-io +===== + +Dm-io provides synchronous and asynchronous I/O services. There are three +types of I/O services available, and each type has a sync and an async +version. + +The user must set up an io_region structure to describe the desired location +of the I/O. Each io_region indicates a block-device along with the starting +sector and size of the region:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +Dm-io can read from one io_region or write to one or more io_regions. Writes +to multiple regions are specified by an array of io_region structures. + +The first I/O service type takes a list of memory pages as the data buffer for +the I/O, along with an offset into the first page:: + + struct page_list { + struct page_list *next; + struct page *page; + }; + + int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + unsigned long *error_bits); + int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page_list *pl, unsigned int offset, + io_notify_fn fn, void *context); + +The second I/O service type takes an array of bio vectors as the data buffer +for the I/O. This service can be handy if the caller has a pre-assembled bio, +but wants to direct different portions of the bio to different devices:: + + int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + unsigned long *error_bits); + int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, + int rw, struct bio_vec *bvec, + io_notify_fn fn, void *context); + +The third I/O service type takes a pointer to a vmalloc'd memory buffer as the +data buffer for the I/O. This service can be handy if the caller needs to do +I/O to a large region but doesn't want to allocate a large number of individual +memory pages:: + + int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, unsigned long *error_bits); + int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, + void *data, io_notify_fn fn, void *context); + +Callers of the asynchronous I/O services must include the name of a completion +callback routine and a pointer to some context data for the I/O:: + + typedef void (*io_notify_fn)(unsigned long error, void *context); + +The "error" parameter in this callback, as well as the `*error` parameter in +all of the synchronous versions, is a bitset (instead of a simple error value). +In the case of an write-I/O to multiple regions, this bitset allows dm-io to +indicate success or failure on each individual region. + +Before using any of the dm-io services, the user should call dm_io_get() +and specify the number of pages they expect to perform I/O on concurrently. +Dm-io will attempt to resize its mempool to make sure enough pages are +always available in order to avoid unnecessary waiting while performing I/O. + +When the user is finished using the dm-io services, they should call +dm_io_put() and specify the same number of pages that were given on the +dm_io_get() call. diff --git a/Documentation/admin-guide/device-mapper/dm-log.rst b/Documentation/admin-guide/device-mapper/dm-log.rst new file mode 100644 index 000000000000..ba4fce39bc27 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-log.rst @@ -0,0 +1,57 @@ +===================== +Device-Mapper Logging +===================== +The device-mapper logging code is used by some of the device-mapper +RAID targets to track regions of the disk that are not consistent. +A region (or portion of the address space) of the disk may be +inconsistent because a RAID stripe is currently being operated on or +a machine died while the region was being altered. In the case of +mirrors, a region would be considered dirty/inconsistent while you +are writing to it because the writes need to be replicated for all +the legs of the mirror and may not reach the legs at the same time. +Once all writes are complete, the region is considered clean again. + +There is a generic logging interface that the device-mapper RAID +implementations use to perform logging operations (see +dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different +logging implementations are available and provide different +capabilities. The list includes: + +============== ============================================================== +Type Files +============== ============================================================== +disk drivers/md/dm-log.c +core drivers/md/dm-log.c +userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h +============== ============================================================== + +The "disk" log type +------------------- +This log implementation commits the log state to disk. This way, the +logging state survives reboots/crashes. + +The "core" log type +------------------- +This log implementation keeps the log state in memory. The log state +will not survive a reboot or crash, but there may be a small boost in +performance. This method can also be used if no storage device is +available for storing log state. + +The "userspace" log type +------------------------ +This log type simply provides a way to export the log API to userspace, +so log implementations can be done there. This is done by forwarding most +logging requests to userspace, where a daemon receives and processes the +request. + +The structure used for communication between kernel and userspace are +located in include/linux/dm-log-userspace.h. Due to the frequency, +diversity, and 2-way communication nature of the exchanges between +kernel and userspace, 'connector' is used as the interface for +communication. + +There are currently two userspace log implementations that leverage this +framework - "clustered-disk" and "clustered-core". These implementations +provide a cluster-coherent log for shared-storage. Device-mapper mirroring +can be used in a shared-storage environment when the cluster log implementations +are employed. diff --git a/Documentation/admin-guide/device-mapper/dm-queue-length.rst b/Documentation/admin-guide/device-mapper/dm-queue-length.rst new file mode 100644 index 000000000000..d8e381c1cb02 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-queue-length.rst @@ -0,0 +1,48 @@ +=============== +dm-queue-length +=============== + +dm-queue-length is a path selector module for device-mapper targets, +which selects a path with the least number of in-flight I/Os. +The path selector name is 'queue-length'. + +Table parameters for each path: [] + +:: + + : The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + +Status for each path: + +:: + + : 'A' if the path is active, 'F' if the path is failed. + : The number of path failures. + : The number of in-flight I/Os on the path. + + +Algorithm +========= + +dm-queue-length increments/decrements 'in-flight' when an I/O is +dispatched/completed respectively. +dm-queue-length selects a path with the minimum 'in-flight'. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128. + +:: + + # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst new file mode 100644 index 000000000000..2fe255b130fb --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -0,0 +1,419 @@ +======= +dm-raid +======= + +The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. +It allows the MD RAID drivers to be accessed using a device-mapper +interface. + + +Mapping Table Interface +----------------------- +The target is named "raid" and it accepts the following parameters:: + + <#raid_params> \ + <#raid_devs> [.. ] + +: + + ============= =============================================================== + raid0 RAID0 striping (no resilience) + raid1 RAID1 mirroring + raid4 RAID4 with dedicated last parity disk + raid5_n RAID5 with dedicated last parity disk supporting takeover + Same as raid4 + + - Transitory layout + raid5_la RAID5 left asymmetric + + - rotating parity 0 with data continuation + raid5_ra RAID5 right asymmetric + + - rotating parity N with data continuation + raid5_ls RAID5 left symmetric + + - rotating parity 0 with data restart + raid5_rs RAID5 right symmetric + + - rotating parity N with data restart + raid6_zr RAID6 zero restart + + - rotating parity zero (left-to-right) with data restart + raid6_nr RAID6 N restart + + - rotating parity N (right-to-left) with data restart + raid6_nc RAID6 N continue + + - rotating parity N (right-to-left) with data continuation + raid6_n_6 RAID6 with dedicate parity disks + + - parity and Q-syndrome on the last 2 disks; + layout for takeover from/to raid4/raid5_n + raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + + - layout for takeover from raid5_la from/to raid6 + raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk + + - layout for takeover from raid5_ra from/to raid6 + raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk + + - layout for takeover from raid5_ls from/to raid6 + raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk + + - layout for takeover from raid5_rs from/to raid6 + raid10 Various RAID10 inspired algorithms chosen by additional params + (see raid10_format and raid10_copies below) + + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') + - RAID1E: Integrated Adjacent Stripe Mirroring + - RAID1E: Integrated Offset Stripe Mirroring + - and other similar RAID10 variants + ============= =============================================================== + + Reference: Chapter 4 of + http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf + +<#raid_params>: The number of parameters that follow. + + consists of + + Mandatory parameters: + : + Chunk size in sectors. This parameter is often known as + "stripe size". It is the only mandatory parameter and + is placed first. + + followed by optional parameters (in any order): + [sync|nosync] + Force or prevent RAID initialization. + + [rebuild ] + Rebuild drive number 'idx' (first drive is 0). + + [daemon_sleep ] + Interval between runs of the bitmap daemon that + clear bits. A longer interval means less bitmap I/O but + resyncing after a failure is likely to take longer. + + [min_recovery_rate ] + Throttle RAID initialization + [max_recovery_rate ] + Throttle RAID initialization + [write_mostly ] + Mark drive index 'idx' write-mostly. + [max_write_behind ] + See '--write-behind=' (man mdadm) + [stripe_cache ] + Stripe cache size (RAID 4/5/6 only) + [region_size ] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. + + [raid10_copies <# copies>], [raid10_format ] + These two options are used to alter the default layout of + a RAID10 configuration. The number of copies is can be + specified, but the default is 2. There are also three + variations to how the copies are laid down - the default + is "near". Near copies are what most people think of with + respect to mirroring. If these options are left unspecified, + or 'raid10_copies 2' and/or 'raid10_format near' are given, + then the layouts for 2, 3 and 4 devices are: + + ======== ========== ============== + 2 drives 3 drives 4 drives + ======== ========== ============== + A1 A1 A1 A1 A2 A1 A1 A2 A2 + A2 A2 A2 A3 A3 A3 A3 A4 A4 + A3 A3 A4 A4 A5 A5 A5 A6 A6 + A4 A4 A5 A6 A6 A7 A7 A8 A8 + .. .. .. .. .. .. .. .. .. + ======== ========== ============== + + The 2-device layout is equivalent 2-way RAID1. The 4-device + layout is what a traditional RAID10 would look like. The + 3-device layout is what might be called a 'RAID1E - Integrated + Adjacent Stripe Mirroring'. + + If 'raid10_copies 2' and 'raid10_format far', then the layouts + for 2, 3 and 4 devices are: + + ======== ============ =================== + 2 drives 3 drives 4 drives + ======== ============ =================== + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + .. .. .. .. .. .. .. .. .. + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ============ =================== + + If 'raid10_copies 2' and 'raid10_format offset', then the + layouts for 2, 3 and 4 devices are: + + ======== ========== ================ + 2 drives 3 drives 4 drives + ======== ========== ================ + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + ======== ========== ================ + + Here we see layouts closely akin to 'RAID1E - Integrated + Offset Stripe Mirroring'. + + [delta_disks ] + The delta_disks option value (-251 < N < +251) triggers + device removal (negative value) or device addition (positive + value) to any reshape supporting raid levels 4/5/6 and 10. + RAID levels 4/5/6 allow for addition of devices (metadata + and data device tuple), raid10_near and raid10_offset only + allow for device addition. raid10_far does not support any + reshaping at all. + A minimum of devices have to be kept to enforce resilience, + which is 3 devices for raid4/5 and 4 devices for raid6. + + [data_offset ] + This option value defines the offset into each data device + where the data starts. This is used to provide out-of-place + reshaping space to avoid writing over data while + changing the layout of stripes, hence an interruption/crash + may happen at any time without the risk of losing data. + E.g. when adding devices to an existing raid set during + forward reshaping, the out-of-place space will be allocated + at the beginning of each raid device. The kernel raid4/5/6/10 + MD personalities supporting such device addition will read the data from + the existing first stripes (those with smaller number of stripes) + starting at data_offset to fill up a new stripe with the larger + number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) + and write that new stripe to offset 0. Same will be applied to all + N-1 other new stripes. This out-of-place scheme is used to change + the RAID type (i.e. the allocation algorithm) as well, e.g. + changing from raid5_ls to raid5_n. + + [journal_dev ] + This option adds a journal device to raid4/5/6 raid sets and + uses it to close the 'write hole' caused by the non-atomic updates + to the component devices which can cause data loss during recovery. + The journal device is used as writethrough thus causing writes to + be throttled versus non-journaled raid4/5/6 sets. + Takeover/reshape is not possible with a raid4/5/6 journal device; + it has to be deconfigured before requesting these. + + [journal_mode ] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev ' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + +<#raid_devs>: The number of devices composing the array. + Each device consists of two entries. The first is the device + containing the metadata (if any); the second is the one containing the + data. A Maximum of 64 metadata/data device entries are supported + up to target version 1.8.0. + 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. + + If a drive has failed or is missing at creation time, a '-' can be + given for both the metadata and data drives for a given position. + + +Example Tables +-------------- + +:: + + # RAID4 - 4 data drives, 1 parity (no metadata devices) + # No metadata devices specified to hold superblock/bitmap info + # Chunk size of 1MiB + # (Lines separated for easy reading) + + 0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + + # RAID4 - 4 data drives, 1 parity (with metadata devices) + # Chunk size of 1MiB, force RAID initialization, + # min recovery rate at 20 kiB/sec/disk + + 0 1960893648 raid \ + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 + + +Status Output +------------- +'dmsetup table' displays the table used to construct the mapping. +The optional parameters are always printed in the order listed +above with "sync" or "nosync" always output ahead of the other +arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. + + +'dmsetup status' yields information on the state and health of the array. +The output is as follows (normally a single line, but expanded here for +clarity):: + + 1: raid \ + 2: <#devices> \ + 3: + +Line 1 is the standard output produced by device-mapper. + +Line 2 & 3 are produced by the raid target and are best explained by example:: + + 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 + +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with its initial +recovery. Here is a fuller description of the individual fields: + + =============== ========================================================= + Same as the used to create the array. + One char for each device, indicating: + + - 'A' = alive and in-sync + - 'a' = alive but not in-sync + - 'D' = dead/failed. + The ratio indicating how much of the array has undergone + the process described by 'sync_action'. If the + 'sync_action' is "check" or "repair", then the process + of "resync" or "recover" can be considered complete. + One of the following possible states: + + idle + - No synchronization action is being performed. + frozen + - The current action has been halted. + resync + - Array is undergoing its initial synchronization + or is resynchronizing after an unclean shutdown + (possibly aided by a bitmap). + recover + - A device in the array is being rebuilt or + replaced. + check + - A user-initiated full check of the array is + being performed. All blocks are read and + checked for consistency. The number of + discrepancies found are recorded in + . No changes are made to the + array by this action. + repair + - The same as "check", but discrepancies are + corrected. + reshape + - The array is undergoing a reshape. + The number of discrepancies found between mirror copies + in RAID1/10 or wrong parity values found in RAID4/5/6. + This value is valid only after a "check" of the array + is performed. A healthy array has a 'mismatch_cnt' of 0. + The current data offset to the start of the user data on + each component device of a raid set (see the respective + raid parameter to support out-of-place reshaping). + - 'A' - active write-through journal device. + - 'a' - active write-back journal device. + - 'D' - dead journal device. + - '-' - no journal device. + =============== ========================================================= + + +Message Interface +----------------- +The dm-raid target will accept certain actions through the 'message' interface. +('man dmsetup' for more information on the message interface.) These actions +include: + + ========= ================================================ + "idle" Halt the current sync action. + "frozen" Freeze the current sync action. + "resync" Initiate/continue a resync. + "recover" Initiate/continue a recover process. + "check" Initiate a check (i.e. a "scrub") of the array. + "repair" Initiate a repair of the array. + ========= ================================================ + + +Discard Support +--------------- +The implementation of discard support among hardware vendors varies. +When a block is discarded, some storage devices will return zeroes when +the block is read. These devices set the 'discard_zeroes_data' +attribute. Other devices will return random data. Confusingly, some +devices that advertise 'discard_zeroes_data' will not reliably return +zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks +from a number of devices to calculate parity blocks and (for performance +reasons) relies on 'discard_zeroes_data' being reliable, it is important +that the devices be consistent. Blocks may be discarded in the middle +of a RAID 4/5/6 stripe and if subsequent read results are not +consistent, the parity blocks may be calculated differently at any time; +making the parity blocks useless for redundancy. It is important to +understand how your hardware behaves with discards if you are going to +enable discards with RAID 4/5/6. + +Since the behavior of storage devices is unreliable in this respect, +even when reporting 'discard_zeroes_data', by default RAID 4/5/6 +discard support is disabled -- this ensures data integrity at the +expense of losing some performance. + +Storage devices that properly support 'discard_zeroes_data' are +increasingly whitelisted in the kernel and can thus be trusted. + +For trusted devices, the following dm-raid module parameter can be set +to safely enable discard support for RAID 4/5/6: + + 'devices_handle_discards_safely' + + +Version History +--------------- + +:: + + 1.0.0 Initial version. Support for RAID 4/5/6 + 1.1.0 Added support for RAID 1 + 1.2.0 Handle creation of arrays that contain failed devices. + 1.3.0 Added support for RAID 10 + 1.3.1 Allow device replacement/rebuild for RAID 10 + 1.3.2 Fix/improve redundancy checking for RAID10 + 1.4.0 Non-functional change. Removes arg from mapping function. + 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). + 1.4.2 Add RAID10 "far" and "offset" algorithm support. + 1.5.0 Add message interface to allow manipulation of the sync_action. + New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. + 1.5.1 Add ability to restore transiently failed devices on resume. + 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". + 1.6.0 Add discard support (and devices_handle_discard_safely module param). + 1.7.0 Add support for MD RAID0 mappings. + 1.8.0 Explicitly check for compatible flags in the superblock metadata + and reject to start the raid set if any are set by a newer + target version, thus avoiding data corruption on a raid set + with a reshape in progress. + 1.9.0 Add support for RAID level takeover/reshape/region size + and set size reduction. + 1.9.1 Fix activation of existing RAID 4/10 mapped devices + 1.9.2 Don't emit '- -' on the status table line in case the constructor + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and + 'D' on the status line. If '- -' is passed into the constructor, emit + '- -' on the table line and '-' as the status line health character. + 1.10.0 Add support for raid4/5/6 journal device + 1.10.1 Fix data corruption on reshape request + 1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) + 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option + 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available + 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') + 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an + state races. + 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen + 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape + deadlock/potential data corruption. Update superblock when + specific devices are requested via rebuild. Fix RAID leg + rebuild errors. diff --git a/Documentation/admin-guide/device-mapper/dm-service-time.rst b/Documentation/admin-guide/device-mapper/dm-service-time.rst new file mode 100644 index 000000000000..facf277fc13c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-service-time.rst @@ -0,0 +1,101 @@ +=============== +dm-service-time +=============== + +dm-service-time is a path selector module for device-mapper targets, +which selects a path with the shortest estimated service time for +the incoming I/O. + +The service time for each path is estimated by dividing the total size +of in-flight I/Os on a path with the performance value of the path. +The performance value is a relative throughput value among all paths +in a path-group, and it can be specified as a table argument. + +The path selector name is 'service-time'. + +Table parameters for each path: + + [ []] + : + The number of I/Os to dispatch using the selected + path before switching to the next path. + If not given, internal default is used. To check + the default value, see the activated table. + : + The relative throughput value of the path + among all paths in the path-group. + The valid range is 0-100. + If not given, minimum value '1' is used. + If '0' is given, the path isn't selected while + other paths having a positive value are available. + +Status for each path: + + + : + 'A' if the path is active, 'F' if the path is failed. + : + The number of path failures. + : + The size of in-flight I/Os on the path. + : + The relative throughput value of the path + among all paths in the path-group. + + +Algorithm +========= + +dm-service-time adds the I/O size to 'in-flight-size' when the I/O is +dispatched and subtracts when completed. +Basically, dm-service-time selects a path having minimum service time +which is calculated by:: + + ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' + +However, some optimizations below are used to reduce the calculation +as much as possible. + + 1. If the paths have the same 'relative_throughput', skip + the division and just compare the 'in-flight-size'. + + 2. If the paths have the same 'in-flight-size', skip the division + and just compare the 'relative_throughput'. + + 3. If some paths have non-zero 'relative_throughput' and others + have zero 'relative_throughput', ignore those paths with zero + 'relative_throughput'. + +If such optimizations can't be applied, calculate service time, and +compare service time. +If calculated service time is equal, the path having maximum +'relative_throughput' may be better. So compare 'relative_throughput' +then. + + +Examples +======== +In case that 2 paths (sda and sdb) are used with repeat_count == 128 +and sda has an average throughput 1GB/s and sdb has 4GB/s, +'relative_throughput' value may be '1' for sda and '4' for sdb:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 + + +Or '2' for sda and '8' for sdb would be also true:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/admin-guide/device-mapper/dm-uevent.rst b/Documentation/admin-guide/device-mapper/dm-uevent.rst new file mode 100644 index 000000000000..4a8ee8d069c9 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-uevent.rst @@ -0,0 +1,110 @@ +==================== +device-mapper uevent +==================== + +The device-mapper uevent code adds the capability to device-mapper to create +and send kobject uevents (uevents). Previously device-mapper events were only +available through the ioctl interface. The advantage of the uevents interface +is the event contains environment attributes providing increased context for +the event avoiding the need to query the state of the device-mapper device after +the event is received. + +There are two functions currently for device-mapper events. The first function +listed creates the event and the second function sends the event(s):: + + void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) + + void dm_send_uevents(struct list_head *events, struct kobject *kobj) + + +The variables added to the uevent environment are: + +Variable Name: DM_TARGET +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Name of device-mapper target that generated the event. + +Variable Name: DM_ACTION +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Device-mapper specific action that caused the uevent action. + PATH_FAILED - A path has failed; + PATH_REINSTATED - A path has been reinstated. + +Variable Name: DM_SEQNUM +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: A sequence number for this specific device-mapper device. +:Value: Valid unsigned integer range. + +Variable Name: DM_PATH +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Major and minor number of the path device pertaining to this + event. +:Value: Path name in the form of "Major:Minor" + +Variable Name: DM_NR_VALID_PATHS +-------------------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: +:Value: Valid unsigned integer range. + +Variable Name: DM_NAME +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Name of the device-mapper device. +:Value: Name + +Variable Name: DM_UUID +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: UUID of the device-mapper device. +:Value: UUID. (Empty string if there isn't one.) + +An example of the uevents generated as captured by udevmonitor is shown +below + +1.) Path failure:: + + UEVENT[1192521009.711215] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_FAILED + DM_SEQNUM=1 + DM_PATH=8:32 + DM_NR_VALID_PATHS=0 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1130 + +2.) Path reinstate:: + + UEVENT[1192521132.989927] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_REINSTATED + DM_SEQNUM=2 + DM_PATH=8:32 + DM_NR_VALID_PATHS=1 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1131 diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst new file mode 100644 index 000000000000..07f56ebc1730 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst @@ -0,0 +1,146 @@ +======== +dm-zoned +======== + +The dm-zoned device mapper target exposes a zoned block device (ZBC and +ZAC compliant devices) as a regular block device without any write +pattern constraints. In effect, it implements a drive-managed zoned +block device which hides from the user (a file system or an application +doing raw block device accesses) the sequential write constraints of +host-managed zoned block devices and can mitigate the potential +device-side performance degradation due to excessive random writes on +host-aware zoned block devices. + +For a more detailed description of the zoned block device models and +their constraints see (for SCSI devices): + +http://www.t10.org/drafts.htm#ZBC_Family + +and (for ATA devices): + +http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf + +The dm-zoned implementation is simple and minimizes system overhead (CPU +and memory usage as well as storage capacity loss). For a 10TB +host-managed disk with 256 MB zones, dm-zoned memory usage per disk +instance is at most 4.5 MB and as little as 5 zones will be used +internally for storing metadata and performaing reclaim operations. + +dm-zoned target devices are formatted and checked using the dmzadm +utility available at: + +https://github.com/hgst/dm-zoned-tools + +Algorithm +========= + +dm-zoned implements an on-disk buffering scheme to handle non-sequential +write accesses to the sequential zones of a zoned block device. +Conventional zones are used for caching as well as for storing internal +metadata. + +The zones of the device are separated into 2 types: + +1) Metadata zones: these are conventional zones used to store metadata. +Metadata zones are not reported as useable capacity to the user. + +2) Data zones: all remaining zones, the vast majority of which will be +sequential zones used exclusively to store user data. The conventional +zones of the device may be used also for buffering user random writes. +Data in these zones may be directly mapped to the conventional zone, but +later moved to a sequential zone so that the conventional zone can be +reused for buffering incoming random writes. + +dm-zoned exposes a logical device with a sector size of 4096 bytes, +irrespective of the physical sector size of the backend zoned block +device being used. This allows reducing the amount of metadata needed to +manage valid blocks (blocks written). + +The on-disk metadata format is as follows: + +1) The first block of the first conventional zone found contains the +super block which describes the on disk amount and position of metadata +blocks. + +2) Following the super block, a set of blocks is used to describe the +mapping of the logical device blocks. The mapping is done per chunk of +blocks, with the chunk size equal to the zoned block device size. The +mapping table is indexed by chunk number and each mapping entry +indicates the zone number of the device storing the chunk of data. Each +mapping entry may also indicate if the zone number of a conventional +zone used to buffer random modification to the data zone. + +3) A set of blocks used to store bitmaps indicating the validity of +blocks in the data zones follows the mapping table. A valid block is +defined as a block that was written and not discarded. For a buffered +data chunk, a block is always valid only in the data zone mapping the +chunk or in the buffer zone of the chunk. + +For a logical chunk mapped to a conventional zone, all write operations +are processed by directly writing to the zone. If the mapping zone is a +sequential zone, the write operation is processed directly only if the +write offset within the logical chunk is equal to the write pointer +offset within of the sequential data zone (i.e. the write operation is +aligned on the zone write pointer). Otherwise, write operations are +processed indirectly using a buffer zone. In that case, an unused +conventional zone is allocated and assigned to the chunk being +accessed. Writing a block to the buffer zone of a chunk will +automatically invalidate the same block in the sequential zone mapping +the chunk. If all blocks of the sequential zone become invalid, the zone +is freed and the chunk buffer zone becomes the primary zone mapping the +chunk, resulting in native random write performance similar to a regular +block device. + +Read operations are processed according to the block validity +information provided by the bitmaps. Valid blocks are read either from +the sequential zone mapping a chunk, or if the chunk is buffered, from +the buffer zone assigned. If the accessed chunk has no mapping, or the +accessed blocks are invalid, the read buffer is zeroed and the read +operation terminated. + +After some time, the limited number of convnetional zones available may +be exhausted (all used to map chunks or buffer sequential zones) and +unaligned writes to unbuffered chunks become impossible. To avoid this +situation, a reclaim process regularly scans used conventional zones and +tries to reclaim the least recently used zones by copying the valid +blocks of the buffer zone to a free sequential zone. Once the copy +completes, the chunk mapping is updated to point to the sequential zone +and the buffer zone freed for reuse. + +Metadata Protection +=================== + +To protect metadata against corruption in case of sudden power loss or +system crash, 2 sets of metadata zones are used. One set, the primary +set, is used as the main metadata region, while the secondary set is +used as a staging area. Modified metadata is first written to the +secondary set and validated by updating the super block in the secondary +set, a generation counter is used to indicate that this set contains the +newest metadata. Once this operation completes, in place of metadata +block updates can be done in the primary metadata set. This ensures that +one of the set is always consistent (all modifications committed or none +at all). Flush operations are used as a commit point. Upon reception of +a flush request, metadata modification activity is temporarily blocked +(for both incoming BIO processing and reclaim process) and all dirty +metadata blocks are staged and updated. Normal operation is then +resumed. Flushing metadata thus only temporarily delays write and +discard requests. Read requests can be processed concurrently while +metadata flush is being executed. + +Usage +===== + +A zoned block device must first be formatted using the dmzadm tool. This +will analyze the device zone configuration, determine where to place the +metadata sets on the device and initialize the metadata sets. + +Ex:: + + dmzadm --format /dev/sdxx + +For a formatted device, the target can be created normally with the +dmsetup utility. The only parameter that dm-zoned requires is the +underlying zoned block device name. Ex:: + + echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ + dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/admin-guide/device-mapper/era.rst b/Documentation/admin-guide/device-mapper/era.rst new file mode 100644 index 000000000000..90dd5c670b9f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/era.rst @@ -0,0 +1,116 @@ +====== +dm-era +====== + +Introduction +============ + +dm-era is a target that behaves similar to the linear target. In +addition it keeps track of which blocks were written within a user +defined period of time called an 'era'. Each era target instance +maintains the current era as a monotonically increasing 32-bit +counter. + +Use cases include tracking changed blocks for backup software, and +partially invalidating the contents of a cache to restore cache +coherency after rolling back a vendor snapshot. + +Constructor +=========== + +era + + ================ ====================================================== + metadata dev fast device holding the persistent metadata + origin dev device holding data blocks that may change + block size block size of origin data device, granularity that is + tracked by the target + ================ ====================================================== + +Messages +======== + +None of the dm messages take any arguments. + +checkpoint +---------- + +Possibly move to a new era. You shouldn't assume the era has +incremented. After sending this message, you should check the +current era via the status line. + +take_metadata_snap +------------------ + +Create a clone of the metadata, to allow a userland process to read it. + +drop_metadata_snap +------------------ + +Drop the metadata snapshot. + +Status +====== + + <#used metadata blocks>/<#total metadata blocks> + + +========================= ============================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +current era The current era +held metadata root The location, in blocks, of the metadata root + that has been 'held' for userspace read + access. '-' indicates there is no held root +========================= ============================================== + +Detailed use case +================= + +The scenario of invalidating a cache when rolling back a vendor +snapshot was the primary use case when developing this target: + +Taking a vendor snapshot +------------------------ + +- Send a checkpoint message to the era target +- Make a note of the current era in its status line +- Take vendor snapshot (the era and snapshot should be forever + associated now). + +Rolling back to an vendor snapshot +---------------------------------- + +- Cache enters passthrough mode (see: dm-cache's docs in cache.txt) +- Rollback vendor storage +- Take metadata snapshot +- Ascertain which blocks have been written since the snapshot was taken + by checking each block's era +- Invalidate those blocks in the caching software +- Cache returns to writeback/writethrough mode + +Memory usage +============ + +The target uses a bitset to record writes in the current era. It also +has a spare bitset ready for switching over to a new era. Other than +that it uses a few 4k blocks for updating metadata:: + + (4 * nr_blocks) bytes + buffers + +Resilience +========== + +Metadata is updated on disk before a write to a previously unwritten +block is performed. As such dm-era should not be effected by a hard +crash such as power failure. + +Userland tools +============== + +Userland tools are found in the increasingly poorly named +thin-provisioning-tools project: + + https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst new file mode 100644 index 000000000000..c77c58b8f67b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/index.rst @@ -0,0 +1,42 @@ +============= +Device Mapper +============= + +.. toctree:: + :maxdepth: 1 + + cache-policies + cache + delay + dm-crypt + dm-flakey + dm-init + dm-integrity + dm-io + dm-log + dm-queue-length + dm-raid + dm-service-time + dm-uevent + dm-zoned + era + kcopyd + linear + log-writes + persistent-data + snapshot + statistics + striped + switch + thin-provisioning + unstriped + verity + writecache + zero + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/device-mapper/kcopyd.rst b/Documentation/admin-guide/device-mapper/kcopyd.rst new file mode 100644 index 000000000000..7651d395127f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/kcopyd.rst @@ -0,0 +1,47 @@ +====== +kcopyd +====== + +Kcopyd provides the ability to copy a range of sectors from one block-device +to one or more other block-devices, with an asynchronous completion +notification. It is used by dm-snapshot and dm-mirror. + +Users of kcopyd must first create a client and indicate how many memory pages +to set aside for their copy jobs. This is done with a call to +kcopyd_client_create():: + + int kcopyd_client_create(unsigned int num_pages, + struct kcopyd_client **result); + +To start a copy job, the user must set up io_region structures to describe +the source and destinations of the copy. Each io_region indicates a +block-device along with the starting sector and size of the region. The source +of the copy is given as one io_region structure, and the destinations of the +copy are given as an array of io_region structures:: + + struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; + }; + +To start the copy, the user calls kcopyd_copy(), passing in the client +pointer, pointers to the source and destination io_regions, the name of a +completion callback routine, and a pointer to some context data for the copy:: + + int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context); + + typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, + void *context); + +When the copy completes, kcopyd will call the user's completion routine, +passing back the user's context pointer. It will also indicate if a read or +write error occurred during the copy. + +When a user is done with all their copy jobs, they should call +kcopyd_client_destroy() to delete the kcopyd client, which will release the +associated memory pages:: + + void kcopyd_client_destroy(struct kcopyd_client *kc); diff --git a/Documentation/admin-guide/device-mapper/linear.rst b/Documentation/admin-guide/device-mapper/linear.rst new file mode 100644 index 000000000000..9d17fc6e64a9 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/linear.rst @@ -0,0 +1,63 @@ +========= +dm-linear +========= + +Device-Mapper's "linear" target maps a linear range of the Device-Mapper +device onto a linear range of another device. This is the basic building +block of logical volume managers. + +Parameters: + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + + +Example scripts +=============== + +:: + + #!/bin/sh + # Create an identity mapping for a device + echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity + +:: + + #!/bin/sh + # Join 2 devices together + size1=`blockdev --getsz $1` + size2=`blockdev --getsz $2` + echo "0 $size1 linear $1 0 + $size1 $size2 linear $2 0" | dmsetup create joined + +:: + + #!/usr/bin/perl -w + # Split a device into 4M chunks and then join them together in reverse order. + + my $name = "reverse"; + my $extent_size = 4 * 1024 * 2; + my $dev = $ARGV[0]; + my $table = ""; + my $count = 0; + + if (!defined($dev)) { + die("Please specify a device.\n"); + } + + my $dev_size = `blockdev --getsz $dev`; + my $extents = int($dev_size / $extent_size) - + (($dev_size % $extent_size) ? 1 : 0); + + while ($extents > 0) { + my $this_start = $count * $extent_size; + $extents--; + $count++; + my $this_offset = $extents * $extent_size; + + $table .= "$this_start $extent_size linear $dev $this_offset\n"; + } + + `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/admin-guide/device-mapper/log-writes.rst b/Documentation/admin-guide/device-mapper/log-writes.rst new file mode 100644 index 000000000000..23141f2ffb7c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/log-writes.rst @@ -0,0 +1,145 @@ +============= +dm-log-writes +============= + +This target takes 2 devices, one to pass all IO to normally, and one to log all +of the write operations to. This is intended for file system developers wishing +to verify the integrity of metadata or data as the file system is written to. +There is a log_write_entry written for every WRITE request and the target is +able to take arbitrary data from userspace to insert into the log. The data +that is in the WRITE requests is copied into the log to make the replay happen +exactly as it happened originally. + +Log Ordering +============ + +We log things in order of completion once we are sure the write is no longer in +cache. This means that normal WRITE requests are not actually logged until the +next REQ_PREFLUSH request. This is to make it easier for userspace to replay +the log in a way that correlates to what is on disk and not what is in cache, +to make it easier to detect improper waiting/flushing. + +This works by attaching all WRITE requests to a list once the write completes. +Once we see a REQ_PREFLUSH request we splice this list onto the request and once +the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only +completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to +simulate the worst case scenario with regard to power failures. Consider the +following example (W means write, C means complete): + + W1,W2,W3,C3,C2,Wflush,C1,Cflush + +The log would show the following: + + W3,W2,flush,W1.... + +Again this is to simulate what is actually on disk, this allows us to detect +cases where a power failure at a particular point in time would create an +inconsistent file system. + +Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as +they complete as those requests will obviously bypass the device cache. + +Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would +have all the DISCARD requests, and then the WRITE requests and then the FLUSH +request. Consider the following example: + + WRITE block 1, DISCARD block 1, FLUSH + +If we logged DISCARD when it completed, the replay would look like this: + + DISCARD 1, WRITE 1, FLUSH + +which isn't quite what happened and wouldn't be caught during the log replay. + +Target interface +================ + +i) Constructor + + log-writes + + ============= ============================================== + dev_path Device that all of the IO will go to normally. + log_dev_path Device where the log entries are written to. + ============= ============================================== + +ii) Status + + <#logged entries> + + =========================== ======================== + #logged entries Number of logged entries + highest allocated sector Highest allocated sector + =========================== ======================== + +iii) Messages + + mark + + You can use a dmsetup message to set an arbitrary mark in a log. + For example say you want to fsck a file system after every + write, but first you need to replay up to the mkfs to make sure + we're fsck'ing something reasonable, you would do something like + this:: + + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + + This would allow you to replay the log up to the mkfs mark and + then replay from that point on doing the fsck check in the + interval that you want. + + Every log has a mark at the end labeled "dm-log-writes-end". + +Userspace component +=================== + +There is a userspace tool that will replay the log for you in various ways. +It can be found here: https://github.com/josefbacik/log-writes + +Example usage +============= + +Say you want to test fsync on your file system. You would do something like +this:: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + dmsetup message log 0 mark fsync + md5sum /mnt/btrfs-test/foo + umount /mnt/btrfs-test + + dmsetup remove log + replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync + mount /dev/sdb /mnt/btrfs-test + md5sum /mnt/btrfs-test/foo + + + Another option is to do a complicated file system operation and verify the file + system is consistent during the entire operation. You could do this with: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + + btrfs filesystem balance /mnt/btrfs-test + umount /mnt/btrfs-test + dmsetup remove log + + replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs + btrfsck /dev/sdb + replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ + --fsck "btrfsck /dev/sdb" --check fua + +And that will replay the log until it sees a FUA request, run the fsck command +and if the fsck passes it will replay to the next FUA, until it is completed or +the fsck command exists abnormally. diff --git a/Documentation/admin-guide/device-mapper/persistent-data.rst b/Documentation/admin-guide/device-mapper/persistent-data.rst new file mode 100644 index 000000000000..2065c3c5a091 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/persistent-data.rst @@ -0,0 +1,88 @@ +=============== +Persistent data +=============== + +Introduction +============ + +The more-sophisticated device-mapper targets require complex metadata +that is managed in kernel. In late 2010 we were seeing that various +different targets were rolling their own data structures, for example: + +- Mikulas Patocka's multisnap implementation +- Heinz Mauelshagen's thin provisioning target +- Another btree-based caching target posted to dm-devel +- Another multi-snapshot target based on a design of Daniel Phillips + +Maintaining these data structures takes a lot of work, so if possible +we'd like to reduce the number. + +The persistent-data library is an attempt to provide a re-usable +framework for people who want to store metadata in device-mapper +targets. It's currently used by the thin-provisioning target and an +upcoming hierarchical storage target. + +Overview +======== + +The main documentation is in the header files which can all be found +under drivers/md/persistent-data. + +The block manager +----------------- + +dm-block-manager.[hc] + +This provides access to the data on disk in fixed sized-blocks. There +is a read/write locking interface to prevent concurrent accesses, and +keep data that is being used in the cache. + +Clients of persistent-data are unlikely to use this directly. + +The transaction manager +----------------------- + +dm-transaction-manager.[hc] + +This restricts access to blocks and enforces copy-on-write semantics. +The only way you can get hold of a writable block through the +transaction manager is by shadowing an existing block (ie. doing +copy-on-write) or allocating a fresh one. Shadowing is elided within +the same transaction so performance is reasonable. The commit method +ensures that all data is flushed before it writes the superblock. +On power failure your metadata will be as it was when last committed. + +The Space Maps +-------------- + +dm-space-map.h +dm-space-map-metadata.[hc] +dm-space-map-disk.[hc] + +On-disk data structures that keep track of reference counts of blocks. +Also acts as the allocator of new blocks. Currently two +implementations: a simpler one for managing blocks on a different +device (eg. thinly-provisioned data blocks); and one for managing +the metadata space. The latter is complicated by the need to store +its own data within the space it's managing. + +The data structures +------------------- + +dm-btree.[hc] +dm-btree-remove.c +dm-btree-spine.c +dm-btree-internal.h + +Currently there is only one data structure, a hierarchical btree. +There are plans to add more. For example, something with an +array-like interface would see a lot of use. + +The btree is 'hierarchical' in that you can define it to be composed +of nested btrees, and take multiple keys. For example, the +thin-provisioning target uses a btree with two levels of nesting. +The first maps a device id to a mapping tree, and that in turn maps a +virtual block to a physical block. + +Values stored in the btrees can have arbitrary size. Keys are always +64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/admin-guide/device-mapper/snapshot.rst b/Documentation/admin-guide/device-mapper/snapshot.rst new file mode 100644 index 000000000000..ccdd8b587a74 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/snapshot.rst @@ -0,0 +1,196 @@ +============================== +Device-mapper snapshot support +============================== + +Device-mapper allows you, without massive data copying: + +- To create snapshots of any block device i.e. mountable, saved states of + the block device which are also writable without interfering with the + original content; +- To create device "forks", i.e. multiple different versions of the + same data stream. +- To merge a snapshot of a block device back into the snapshot's origin + device. + +In the first two cases, dm copies only the chunks of data that get +changed and uses a separate copy-on-write (COW) block device for +storage. + +For snapshot merge the contents of the COW storage are merged back into +the origin device. + + +There are three dm targets available: +snapshot, snapshot-origin, and snapshot-merge. + +- snapshot-origin + +which will normally have one or more snapshots based on it. +Reads will be mapped directly to the backing device. For each write, the +original data will be saved in the of each snapshot to keep +its visible content unchanged, at least until the fills up. + + +- snapshot + [<# feature args> []*] + +A snapshot of the block device is created. Changed chunks of + sectors will be stored on the . Writes will +only go to the . Reads will come from the or +from for unchanged data. will often be +smaller than the origin and if it fills up the snapshot will become +useless and be disabled, returning errors. So it is important to monitor +the amount of free space and expand the before it fills up. + + is P (Persistent) or N (Not persistent - will not survive +after reboot). O (Overflow) can be added as a persistent store option +to allow userspace to advertise its support for seeing "Overflow" in the +snapshot status. So supported store types are "P", "PO" and "N". + +The difference between persistent and transient is with transient +snapshots less metadata must be saved on disk - they can be kept in +memory by the kernel. + +When loading or unloading the snapshot target, the corresponding +snapshot-origin or snapshot-merge target must be suspended. A failure to +suspend the origin target could result in data corruption. + +Optional features: + + discard_zeroes_cow - a discard issued to the snapshot device that + maps to entire chunks to will zero the corresponding exception(s) in + the snapshot's exception store. + + discard_passdown_origin - a discard to the snapshot device is passed + down to the snapshot-origin's underlying device. This doesn't cause + copy-out to the snapshot exception store because the snapshot-origin + target is bypassed. + + The discard_passdown_origin feature depends on the discard_zeroes_cow + feature being enabled. + + +- snapshot-merge + [<# feature args> []*] + +takes the same table arguments as the snapshot target except it only +works with persistent snapshots. This target assumes the role of the +"snapshot-origin" target and must not be loaded if the "snapshot-origin" +is still present for . + +Creates a merging snapshot that takes control of the changed chunks +stored in the of an existing snapshot, through a handover +procedure, and merges these chunks back into the . Once merging +has started (in the background) the may be opened and the merge +will continue while I/O is flowing to it. Changes to the are +deferred until the merging snapshot's corresponding chunk(s) have been +merged. Once merging has started the snapshot device, associated with +the "snapshot" target, will return -EIO when accessed. + + +How snapshot is used by LVM2 +============================ +When you create the first LVM2 snapshot of a volume, four dm devices are used: + +1) a device containing the original mapping table of the source volume; +2) a device used as the ; +3) a "snapshot" device, combining #1 and #2, which is the visible snapshot + volume; +4) the "original" volume (which uses the device number used by the original + source volume), whose table is replaced by a "snapshot-origin" mapping + from device #1. + +A fixed naming scheme is used, so with the following commands:: + + lvcreate -L 1G -n base volumeGroup + lvcreate -L 100M --snapshot -n snap volumeGroup/base + +we'll have this situation (with volumes in above order):: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 + volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 + volumeGroup-base: 0 2097152 snapshot-origin 254:11 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow + brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap + brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + + +How snapshot-merge is used by LVM2 +================================== +A merging snapshot assumes the role of the "snapshot-origin" while +merging. As such the "snapshot-origin" is replaced with +"snapshot-merge". The "-real" device is not changed and the "-cow" +device is renamed to -cow to aid LVM2's cleanup of the +merging snapshot after it completes. The "snapshot" that hands over its +COW device to the "snapshot-merge" is deactivated (unless using lvchange +--refresh); but if it is left active it will simply return I/O errors. + +A snapshot will merge into its origin with the following command:: + + lvconvert --merge volumeGroup/snap + +we'll now have this situation:: + + # dmsetup table|grep volumeGroup + + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-base-cow: 0 204800 linear 8:19 2097536 + volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow + brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base + + +How to determine when a merging is complete +=========================================== +The snapshot-merge and snapshot status lines end with: + + / + +Both and include both data and metadata. +During merging, the number of sectors allocated gets smaller and +smaller. Merging has finished when the number of sectors holding data +is zero, in other words == . + +Here is a practical example (using a hybrid of lvm and dmsetup commands):: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g + snap volumeGroup swi-a- 1.00g base 18.97 + + # dmsetup status volumeGroup-snap + 0 8388608 snapshot 397896/2097152 1560 + ^^^^ metadata sectors + + # lvconvert --merge -b volumeGroup/snap + Merging of volume snap started. + + # lvs volumeGroup/snap + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup Owi-a- 4.00g 17.23 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 281688/2097152 1104 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 180480/2097152 712 + + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 16/2097152 16 + +Merging has finished. + +:: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst new file mode 100644 index 000000000000..3d80a9f850cc --- /dev/null +++ b/Documentation/admin-guide/device-mapper/statistics.rst @@ -0,0 +1,225 @@ +============= +DM statistics +============= + +Device Mapper supports the collection of I/O statistics on user-defined +regions of a DM device. If no regions are defined no statistics are +collected so there isn't any performance impact. Only bio-based DM +devices are currently supported. + +Each user-defined region specifies a starting sector, length and step. +Individual statistics will be collected for each step-sized area within +the range specified. + +The I/O statistics counters for each step-sized area of a region are +in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: +Documentation/iostats.txt). But two extra counters (12 and 13) are +provided: total time spent reading and writing. When the histogram +argument is used, the 14th parameter is reported that represents the +histogram of latencies. All these counters may be accessed by sending +the @stats_print message to the appropriate DM device via dmsetup. + +The reported times are in milliseconds and the granularity depends on +the kernel ticks. When the option precise_timestamps is used, the +reported times are in nanoseconds. + +Each region has a corresponding unique identifier, which we call a +region_id, that is assigned when the region is created. The region_id +must be supplied when querying statistics about the region, deleting the +region, etc. Unique region_ids enable multiple userspace programs to +request and process statistics for the same DM device without stepping +on each other's data. + +The creation of DM statistics will allocate memory via kmalloc or +fallback to using vmalloc space. At most, 1/4 of the overall system +memory may be allocated by DM statistics. The admin can see how much +memory is used by reading: + + /sys/module/dm_mod/parameters/stats_current_allocated_bytes + +Messages +======== + + @stats_create [ ...] [ []] + Create a new region and return the region_id. + + + "-" + whole device + "+" + a range of 512-byte sectors + starting with . + + + "" + the range is subdivided into areas each containing + sectors. + "/" + the range is subdivided into the specified + number of areas. + + + The number of optional arguments + + + The following optional arguments are supported: + + precise_timestamps + use precise timer with nanosecond resolution + instead of the "jiffies" variable. When this argument is + used, the resulting times are in nanoseconds instead of + milliseconds. Precise timestamps are a little bit slower + to obtain than jiffies-based timestamps. + histogram:n1,n2,n3,n4,... + collect histogram of latencies. The + numbers n1, n2, etc are times that represent the boundaries + of the histogram. If precise_timestamps is not used, the + times are in milliseconds, otherwise they are in + nanoseconds. For each range, the kernel will report the + number of requests that completed within this range. For + example, if we use "histogram:10,20,30", the kernel will + report four numbers a:b:c:d. a is the number of requests + that took 0-10 ms to complete, b is the number of requests + that took 10-20 ms to complete, c is the number of requests + that took 20-30 ms to complete and d is the number of + requests that took more than 30 ms to complete. + + + An optional parameter. A name that uniquely identifies + the userspace owner of the range. This groups ranges together + so that userspace programs can identify the ranges they + created and ignore those created by others. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use it for anything else. + If we omit the number of optional arguments, program id must not + be a number, otherwise it would be interpreted as the number of + optional arguments. + + + An optional parameter. A word that provides auxiliary data + that is useful to the client program that created the range. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use this value for anything. + + @stats_delete + Delete the region with the specified id. + + + region_id returned from @stats_create + + @stats_clear + Clear all the counters except the in-flight i/o counters. + + + region_id returned from @stats_create + + @stats_list [] + List all regions registered with @stats_create. + + + An optional parameter. + If this parameter is specified, only matching regions + are returned. + If it is not specified, all regions are returned. + + Output format: + : + + precise_timestamps histogram:n1,n2,n3,... + + The strings "precise_timestamps" and "histogram" are printed only + if they were specified when creating the region. + + @stats_print [ ] + Print counters for each step-sized area of a region. + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are returned. + + + The number of lines to include in the output. + If omitted, all lines are returned. + + Output format for each step-sized area of a region: + + + + counters + + The first 11 counters have the same meaning as + `/sys/block/*/stat or /proc/diskstats`. + + Please refer to Documentation/iostats.txt for details. + + 1. the number of reads completed + 2. the number of reads merged + 3. the number of sectors read + 4. the number of milliseconds spent reading + 5. the number of writes completed + 6. the number of writes merged + 7. the number of sectors written + 8. the number of milliseconds spent writing + 9. the number of I/Os currently in progress + 10. the number of milliseconds spent doing I/Os + 11. the weighted number of milliseconds spent doing I/Os + + Additional counters: + + 12. the total time spent reading in milliseconds + 13. the total time spent writing in milliseconds + + @stats_print_clear [ ] + Atomically print and then clear all the counters except the + in-flight i/o counters. Useful when the client consuming the + statistics does not want to lose any statistics (those updated + between printing and clearing). + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are printed and then cleared. + + + The number of lines to process. + If omitted, all lines are printed and then cleared. + + @stats_set_aux + Store auxiliary data aux_data for the specified region. + + + region_id returned from @stats_create + + + The string that identifies data which is useful to the client + program that created the range. The kernel returns this + string back in the output of @stats_list message, but it + doesn't use this value for anything. + +Examples +======== + +Subdivide the DM device 'vol' into 100 pieces and start collecting +statistics on them:: + + dmsetup message vol 0 @stats_create - /100 + +Set the auxiliary data string to "foo bar baz" (the escape for each +space must also be escaped, otherwise the shell will consume them):: + + dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz + +List the statistics:: + + dmsetup message vol 0 @stats_list + +Print the statistics:: + + dmsetup message vol 0 @stats_print 0 + +Delete the statistics:: + + dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/admin-guide/device-mapper/striped.rst b/Documentation/admin-guide/device-mapper/striped.rst new file mode 100644 index 000000000000..e9a8da192ae1 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/striped.rst @@ -0,0 +1,61 @@ +========= +dm-stripe +========= + +Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) +device across one or more underlying devices. Data is written in "chunks", +with consecutive chunks rotating among the underlying devices. This can +potentially provide improved I/O throughput by utilizing several physical +devices in parallel. + +Parameters: [ ]+ + : + Number of underlying devices. + : + Size of each chunk of data. Must be at least as + large as the system's PAGE_SIZE. + : + Full pathname to the underlying block-device, or a + "major:minor" device-number. + : + Starting sector within the device. + +One or more underlying devices can be specified. The striped device size must +be a multiple of the chunk size multiplied by the number of underlying devices. + + +Example scripts +=============== + +:: + + #!/usr/bin/perl -w + # Create a striped device across any number of underlying devices. The device + # will be called "stripe_dev" and have a chunk-size of 128k. + + my $chunk_size = 128 * 2; + my $dev_name = "stripe_dev"; + my $num_devs = @ARGV; + my @devs = @ARGV; + my ($min_dev_size, $stripe_dev_size, $i); + + if (!$num_devs) { + die("Specify at least one device\n"); + } + + $min_dev_size = `blockdev --getsz $devs[0]`; + for ($i = 1; $i < $num_devs; $i++) { + my $this_size = `blockdev --getsz $devs[$i]`; + $min_dev_size = ($min_dev_size < $this_size) ? + $min_dev_size : $this_size; + } + + $stripe_dev_size = $min_dev_size * $num_devs; + $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); + + $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; + for ($i = 0; $i < $num_devs; $i++) { + $table .= " $devs[$i] 0"; + } + + `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/admin-guide/device-mapper/switch.rst b/Documentation/admin-guide/device-mapper/switch.rst new file mode 100644 index 000000000000..7dde06be1a4f --- /dev/null +++ b/Documentation/admin-guide/device-mapper/switch.rst @@ -0,0 +1,141 @@ +========= +dm-switch +========= + +The device-mapper switch target creates a device that supports an +arbitrary mapping of fixed-size regions of I/O across a fixed set of +paths. The path used for any specific region can be switched +dynamically by sending the target a message. + +It maps I/O to underlying block devices efficiently when there is a large +number of fixed-sized address regions but there is no simple pattern +that would allow for a compact representation of the mapping such as +dm-stripe. + +Background +---------- + +Dell EqualLogic and some other iSCSI storage arrays use a distributed +frameless architecture. In this architecture, the storage group +consists of a number of distinct storage arrays ("members") each having +independent controllers, disk storage and network adapters. When a LUN +is created it is spread across multiple members. The details of the +spreading are hidden from initiators connected to this storage system. +The storage group exposes a single target discovery portal, no matter +how many members are being used. When iSCSI sessions are created, each +session is connected to an eth port on a single member. Data to a LUN +can be sent on any iSCSI session, and if the blocks being accessed are +stored on another member the I/O will be forwarded as required. This +forwarding is invisible to the initiator. The storage layout is also +dynamic, and the blocks stored on disk may be moved from member to +member as needed to balance the load. + +This architecture simplifies the management and configuration of both +the storage group and initiators. In a multipathing configuration, it +is possible to set up multiple iSCSI sessions to use multiple network +interfaces on both the host and target to take advantage of the +increased network bandwidth. An initiator could use a simple round +robin algorithm to send I/O across all paths and let the storage array +members forward it as necessary, but there is a performance advantage to +sending data directly to the correct member. + +A device-mapper table already lets you map different regions of a +device onto different targets. However in this architecture the LUN is +spread with an address region size on the order of 10s of MBs, which +means the resulting table could have more than a million entries and +consume far too much memory. + +Using this device-mapper switch target we can now build a two-layer +device hierarchy: + + Upper Tier - Determine which array member the I/O should be sent to. + Lower Tier - Load balance amongst paths to a particular member. + +The lower tier consists of a single dm multipath device for each member. +Each of these multipath devices contains the set of paths directly to +the array member in one priority group, and leverages existing path +selectors to load balance amongst these paths. We also build a +non-preferred priority group containing paths to other array members for +failover reasons. + +The upper tier consists of a single dm-switch device. This device uses +a bitmap to look up the location of the I/O and choose the appropriate +lower tier device to route the I/O. By using a bitmap we are able to +use 4 bits for each address range in a 16 member group (which is very +large for us). This is a much denser representation than the dm table +b-tree can achieve. + +Construction Parameters +======================= + + [...] [ ]+ + + The number of paths across which to distribute the I/O. + + + The number of 512-byte sectors in a region. Each region can be redirected + to any of the available paths. + + + The number of optional arguments. Currently, no optional arguments + are supported and so this must be zero. + + + The block device that represents a specific path to the device. + + + The offset of the start of data on the specific (in units + of 512-byte sectors). This number is added to the sector number when + forwarding the request to the specific path. Typically it is zero. + +Messages +======== + +set_region_mappings : []: []:... + +Modify the region table by specifying which regions are redirected to +which paths. + + + The region number (region size was specified in constructor parameters). + If index is omitted, the next region (previous index + 1) is used. + Expressed in hexadecimal (WITHOUT any prefix like 0x). + + + The path number in the range 0 ... ( - 1). + Expressed in hexadecimal (WITHOUT any prefix like 0x). + +R, + This parameter allows repetitive patterns to be loaded quickly. and + are hexadecimal numbers. The last mappings are repeated in the next + slots. + +Status +====== + +No status line is reported. + +Example +======= + +Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with +the same size. + +Create a switch device with 64kB region size:: + + dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` + switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" + +Set mappings for the first 7 entries to point to devices switch0, switch1, +switch2, switch0, switch1, switch2, switch1:: + + dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 + +Set repetitive mapping. This command:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 + +is equivalent to:: + + dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ + :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 diff --git a/Documentation/admin-guide/device-mapper/thin-provisioning.rst b/Documentation/admin-guide/device-mapper/thin-provisioning.rst new file mode 100644 index 000000000000..bafebf79da4b --- /dev/null +++ b/Documentation/admin-guide/device-mapper/thin-provisioning.rst @@ -0,0 +1,427 @@ +================= +Thin provisioning +================= + +Introduction +============ + +This document describes a collection of device-mapper targets that +between them implement thin-provisioning and snapshots. + +The main highlight of this implementation, compared to the previous +implementation of snapshots, is that it allows many virtual devices to +be stored on the same data volume. This simplifies administration and +allows the sharing of data between volumes, thus reducing disk usage. + +Another significant feature is support for an arbitrary depth of +recursive snapshots (snapshots of snapshots of snapshots ...). The +previous implementation of snapshots did this by chaining together +lookup tables, and so performance was O(depth). This new +implementation uses a single data structure to avoid this degradation +with depth. Fragmentation may still be an issue, however, in some +scenarios. + +Metadata is stored on a separate device from data, giving the +administrator some freedom, for example to: + +- Improve metadata resilience by storing metadata on a mirrored volume + but data on a non-mirrored one. + +- Improve performance by storing the metadata on SSD. + +Status +====== + +These targets are considered safe for production use. But different use +cases will have different performance characteristics, for example due +to fragmentation of the data volume. + +If you find this software is not performing as expected please mail +dm-devel@redhat.com with details and we'll try our best to improve +things for you. + +Userspace tools for checking and repairing the metadata have been fully +developed and are available as 'thin_check' and 'thin_repair'. The name +of the package that provides these utilities varies by distribution (on +a Red Hat distribution it is named 'device-mapper-persistent-data'). + +Cookbook +======== + +This section describes some quick recipes for using thin provisioning. +They use the dmsetup program to control the device-mapper driver +directly. End users will be advised to use a higher-level volume +manager such as LVM2 once support has been added. + +Pool device +----------- + +The pool device ties together the metadata volume and the data volume. +It maps I/O linearly to the data volume and updates the metadata via +two mechanisms: + +- Function calls from the thin targets + +- Device-mapper 'messages' from userspace which control the creation of new + virtual devices amongst other things. + +Setting up a fresh pool device +------------------------------ + +Setting up a pool device requires a valid metadata device, and a +data device. If you do not have an existing metadata device you can +make one by zeroing the first 4k to indicate empty metadata. + + dd if=/dev/zero of=$metadata_dev bs=4096 count=1 + +The amount of metadata you need will vary according to how many blocks +are shared between thin devices (i.e. through snapshots). If you have +less sharing than average you'll need a larger-than-average metadata device. + +As a guide, we suggest you calculate the number of bytes to use in the +metadata device as 48 * $data_dev_size / $data_block_size but round it up +to 2MB if the answer is smaller. If you're creating large numbers of +snapshots which are recording large amounts of change, you may find you +need to increase this. + +The largest size supported is 16GB: If the device is larger, +a warning will be issued and the excess space will not be used. + +Reloading a pool table +---------------------- + +You may reload a pool's table, indeed this is how the pool is resized +if it runs out of space. (N.B. While specifying a different metadata +device when reloading is not forbidden at the moment, things will go +wrong if it does not route I/O to exactly the same on-disk location as +previously.) + +Using an existing pool device +----------------------------- + +:: + + dmsetup create pool \ + --table "0 20971520 thin-pool $metadata_dev $data_dev \ + $data_block_size $low_water_mark" + +$data_block_size gives the smallest unit of disk space that can be +allocated at a time expressed in units of 512-byte sectors. +$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a +multiple of 128 (64KB). $data_block_size cannot be changed after the +thin-pool is created. People primarily interested in thin provisioning +may want to use a value such as 1024 (512KB). People doing lots of +snapshotting may want a smaller value such as 128 (64KB). If you are +not zeroing newly-allocated data, a larger $data_block_size in the +region of 256000 (128MB) is suggested. + +$low_water_mark is expressed in blocks of size $data_block_size. If +free space on the data device drops below this level then a dm event +will be triggered which a userspace daemon should catch allowing it to +extend the pool device. Only one such event will be sent. + +No special event is triggered if a just resumed device's free space is below +the low water mark. However, resuming a device always triggers an +event; a userspace daemon should verify that free space exceeds the low +water mark when handling this event. + +A low water mark for the metadata device is maintained in the kernel and +will trigger a dm event if free space on the metadata device drops below +it. + +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a FLUSH or FUA bio is written. +If no such requests are made then commits will occur every second. This +means the thin-provisioning target behaves like a physical disk that has +a volatile write cache. If power is lost you may lose some recent +writes. The metadata should always be consistent in spite of any crash. + +If data space is exhausted the pool will either error or queue IO +according to the configuration (see: error_if_no_space). If metadata +space is exhausted or a metadata operation fails: the pool will error IO +until the pool is taken offline and repair is performed to 1) fix any +potential inconsistencies and 2) clear the flag that imposes repair. +Once the pool's metadata device is repaired it may be resized, which +will allow the pool to return to normal operation. Note that if a pool +is flagged as needing repair, the pool's data and metadata devices +cannot be resized until repair is performed. It should also be noted +that when the pool's metadata space is exhausted the current metadata +transaction is aborted. Given that the pool will cache IO whose +completion may have already been acknowledged to upper IO layers +(e.g. filesystem) it is strongly suggested that consistency checks +(e.g. fsck) be performed on those layers when repair of the pool is +required. + +Thin provisioning +----------------- + +i) Creating a new thinly-provisioned volume. + + To create a new thinly- provisioned volume you must send a message to an + active pool device, /dev/mapper/pool in this example:: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + + Here '0' is an identifier for the volume, a 24-bit number. It's up + to the caller to allocate and manage these identifiers. If the + identifier is already in use, the message will fail with -EEXIST. + +ii) Using a thinly-provisioned volume. + + Thinly-provisioned volumes are activated using the 'thin' target:: + + dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" + + The last parameter is the identifier for the thinp device. + +Internal snapshots +------------------ + +i) Creating an internal snapshot. + + Snapshots are created with another message to the pool. + + N.B. If the origin device that you wish to snapshot is active, you + must suspend it before creating the snapshot to avoid corruption. + This is NOT enforced at the moment, so please be careful! + + :: + + dmsetup suspend /dev/mapper/thin + dmsetup message /dev/mapper/pool 0 "create_snap 1 0" + dmsetup resume /dev/mapper/thin + + Here '1' is the identifier for the volume, a 24-bit number. '0' is the + identifier for the origin device. + +ii) Using an internal snapshot. + + Once created, the user doesn't have to worry about any connection + between the origin and the snapshot. Indeed the snapshot is no + different from any other thinly-provisioned device and can be + snapshotted itself via the same method. It's perfectly legal to + have only one of them active, and there's no ordering requirement on + activating or removing them both. (This differs from conventional + device-mapper snapshots.) + + Activate it exactly the same way as any other thinly-provisioned volume:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" + +External snapshots +------------------ + +You can use an external **read only** device as an origin for a +thinly-provisioned volume. Any read to an unprovisioned area of the +thin device will be passed through to the origin. Writes trigger +the allocation of new blocks as usual. + +One use case for this is VM hosts that want to run guests on +thinly-provisioned volumes but have the base image on another device +(possibly shared between many VMs). + +You must not write to the origin device if you use this technique! +Of course, you may write to the thin device and take internal snapshots +of the thin volume. + +i) Creating a snapshot of an external device + + This is the same as creating a thin device. + You don't mention the origin at this stage. + + :: + + dmsetup message /dev/mapper/pool 0 "create_thin 0" + +ii) Using a snapshot of an external device. + + Append an extra parameter to the thin target specifying the origin:: + + dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" + + N.B. All descendants (internal snapshots) of this snapshot require the + same extra origin parameter. + +Deactivation +------------ + +All devices using a pool must be deactivated before the pool itself +can be. + +:: + + dmsetup remove thin + dmsetup remove snap + dmsetup remove pool + +Reference +========= + +'thin-pool' target +------------------ + +i) Constructor + + :: + + thin-pool \ + [ []*] + + Optional feature arguments: + + skip_block_zeroing: + Skip the zeroing of newly-provisioned blocks. + + ignore_discard: + Disable discard support. + + no_discard_passdown: + Don't pass discards down to the underlying + data device, but just remove the mapping. + + read_only: + Don't allow any changes to be made to the pool + metadata. This mode is only available after the + thin-pool has been created and first used in full + read/write mode. It cannot be specified on initial + thin-pool creation. + + error_if_no_space: + Error IOs, instead of queueing, if no space. + + Data block size must be between 64KB (128 sectors) and 1GB + (2097152 sectors) inclusive. + + +ii) Status + + :: + + / + / + ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space + needs_check|- metadata_low_watermark + + transaction id: + A 64-bit number used by userspace to help synchronise with metadata + from volume managers. + + used data blocks / total data blocks + If the number of free blocks drops below the pool's low water mark a + dm event will be sent to userspace. This event is edge-triggered and + it will occur only once after each resume so volume manager writers + should register for the event and then check the target's status. + + held metadata root: + The location, in blocks, of the metadata root that has been + 'held' for userspace read access. '-' indicates there is no + held root. + + discard_passdown|no_discard_passdown + Whether or not discards are actually being passed down to the + underlying device. When this is enabled when loading the table, + it can get disabled if the underlying device doesn't support it. + + ro|rw|out_of_data_space + If the pool encounters certain types of device failures it will + drop into a read-only metadata mode in which no changes to + the pool metadata (like allocating new blocks) are permitted. + + In serious cases where even a read-only mode is deemed unsafe + no further I/O will be permitted and the status will just + contain the string 'Fail'. The userspace recovery tools + should then be used. + + error_if_no_space|queue_if_no_space + If the pool runs out of data or metadata space, the pool will + either queue or error the IO destined to the data device. The + default is to queue the IO until more space is added or the + 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool + module parameter can be used to change this timeout -- it + defaults to 60 seconds but may be disabled using a value of 0. + + needs_check + A metadata operation has failed, resulting in the needs_check + flag being set in the metadata's superblock. The metadata + device must be deactivated and checked/repaired before the + thin-pool can be made fully operational again. '-' indicates + needs_check is not set. + + metadata_low_watermark: + Value of metadata low watermark in blocks. The kernel sets this + value internally but userspace needs to know this value to + determine if an event was caused by crossing this threshold. + +iii) Messages + + create_thin + Create a new thinly-provisioned device. + is an arbitrary unique 24-bit identifier chosen by + the caller. + + create_snap + Create a new snapshot of another thinly-provisioned device. + is an arbitrary unique 24-bit identifier chosen by + the caller. + is the identifier of the thinly-provisioned device + of which the new device will be a snapshot. + + delete + Deletes a thin device. Irreversible. + + set_transaction_id + Userland volume managers, such as LVM, need a way to + synchronise their external metadata with the internal metadata of the + pool target. The thin-pool target offers to store an + arbitrary 64-bit transaction id and return it on the target's + status line. To avoid races you must provide what you think + the current transaction id is when you change it with this + compare-and-swap message. + + reserve_metadata_snap + Reserve a copy of the data mapping btree for use by userland. + This allows userland to inspect the mappings as they were when + this message was executed. Use the pool's status command to + get the root block associated with the metadata snapshot. + + release_metadata_snap + Release a previously reserved copy of the data mapping btree. + +'thin' target +------------- + +i) Constructor + + :: + + thin [] + + pool dev: + the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 + + dev id: + the internal device identifier of the device to be + activated. + + external origin dev: + an optional block device outside the pool to be treated as a + read-only snapshot origin: reads to unprovisioned areas of the + thin target will be mapped to this device. + +The pool doesn't store any size against the thin devices. If you +load a thin target that is smaller than you've been using previously, +then you'll have no access to blocks mapped beyond the end. If you +load a target that is bigger than before, then extra blocks will be +provisioned as and when needed. + +ii) Status + + + If the pool has encountered device errors and failed, the status + will just contain the string 'Fail'. The userspace recovery + tools should then be used. + + In the case where is 0, there is no highest + mapped sector and the value of is unspecified. diff --git a/Documentation/admin-guide/device-mapper/unstriped.rst b/Documentation/admin-guide/device-mapper/unstriped.rst new file mode 100644 index 000000000000..0a8d3eb3f072 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/unstriped.rst @@ -0,0 +1,135 @@ +================================ +Device-mapper "unstriped" target +================================ + +Introduction +============ + +The device-mapper "unstriped" target provides a transparent mechanism to +unstripe a device-mapper "striped" target to access the underlying disks +without having to touch the true backing block-device. It can also be +used to unstripe a hardware RAID-0 to access backing disks. + +Parameters: + + + + The number of stripes in the RAID 0. + + + The amount of 512B sectors in the chunk striping. + + + The block device you wish to unstripe. + + + The stripe number within the device that corresponds to physical + drive you wish to unstripe. This must be 0 indexed. + + +Why use this module? +==================== + +An example of undoing an existing dm-stripe +------------------------------------------- + +This small bash script will setup 4 loop devices and use the existing +striped target to combine the 4 devices into one. It then will use +the unstriped target ontop of the striped device to access the +individual backing loop devices. We write data to the newly exposed +unstriped devices and verify the data written matches the correct +underlying device on the striped array:: + + #!/bin/bash + + MEMBER_SIZE=$((128 * 1024 * 1024)) + NUM=4 + SEQ_END=$((${NUM}-1)) + CHUNK=256 + BS=4096 + + RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) + DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" + COUNT=$((${MEMBER_SIZE} / ${BS})) + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" + done + + echo $DM_PARMS | dmsetup create raid0 + for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} + done; + + for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} + done + + dmsetup remove raid0 + + for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} + done + +Another example +--------------- + +Intel NVMe drives contain two cores on the physical device. +Each core of the drive has segregated access to its LBA range. +The current LBA model has a RAID 0 128k chunk on each core, resulting +in a 256k stripe across the two cores:: + + Core 0: Core 1: + __________ __________ + | LBA 512| | LBA 768| + | LBA 0 | | LBA 256| + ---------- ---------- + +The purpose of this unstriping is to provide better QoS in noisy +neighbor environments. When two partitions are created on the +aggregate drive without this unstriping, reads on one partition +can affect writes on another partition. This is because the partitions +are striped across the two cores. When we unstripe this hardware RAID 0 +and make partitions on each new exposed device the two partitions are now +physically separated. + +With the dm-unstriped target we're able to segregate an fio script that +has read and write jobs that are independent of each other. Compared to +when we run the test on a combined drive with partitions, we were able +to get a 92% reduction in read latency using this device mapper target. + + +Example dmsetup usage +===================== + +unstriped ontop of Intel NVMe device that has 2 cores +----------------------------------------------------- + +:: + + dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' + dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +There will now be two devices that expose Intel NVMe core 0 and 1 +respectively:: + + /dev/mapper/nvmset0 + /dev/mapper/nvmset1 + +unstriped ontop of striped with 4 drives using 128K chunk size +-------------------------------------------------------------- + +:: + + dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' + dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' + dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' + dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst new file mode 100644 index 000000000000..a4d1c1476d72 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -0,0 +1,229 @@ +========= +dm-verity +========= + +Device-Mapper's "verity" target provides transparent integrity checking of +block devices using a cryptographic digest provided by the kernel crypto API. +This target is read-only. + +Construction Parameters +======================= + +:: + + + + + + [<#opt_params> ] + + + This is the type of the on-disk hash format. + + 0 is the original format used in the Chromium OS. + The salt is appended when hashing, digests are stored continuously and + the rest of the block is padded with zeroes. + + 1 is the current format that should be used for new devices. + The salt is prepended when hashing and each digest is + padded with zeroes to the power of two. + + + This is the device containing data, the integrity of which needs to be + checked. It may be specified as a path, like /dev/sdaX, or a device number, + :. + + + This is the device that supplies the hash tree data. It may be + specified similarly to the device path and may be the same device. If the + same device is used, the hash_start should be outside the configured + dm-verity device. + + + The block size on a data device in bytes. + Each block corresponds to one digest on the hash device. + + + The size of a hash block in bytes. + + + The number of data blocks on the data device. Additional blocks are + inaccessible. You can place hashes to the same partition as data, in this + case hashes are placed after . + + + This is the offset, in -blocks, from the start of hash_dev + to the root block of the hash tree. + + + The cryptographic hash algorithm used for this device. This should + be the name of the algorithm, like "sha1". + + + The hexadecimal encoding of the cryptographic hash of the root hash block + and the salt. This hash should be trusted as there is no other authenticity + beyond this point. + + + The hexadecimal encoding of the salt value. + +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 ignore_corruption + +ignore_corruption + Log corrupted blocks, but allow read operations to proceed normally. + +restart_on_corruption + Restart the system when a corrupted block is discovered. This option is + not compatible with ignore_corruption and requires user space support to + avoid restart loops. + +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + +use_fec_from_device + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity is encrypted the should be too. + +fec_roots + Number of generator roots. This equals to the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks + The number of encoding data blocks on the FEC device. The block size for + the FEC device is . + +fec_start + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + +check_at_most_once + Verify data blocks only the first time they are read from the data device, + rather than every time. This reduces the overhead of dm-verity so that it + can be used on systems that are memory and/or CPU constrained. However, it + provides a reduced level of security because only offline tampering of the + data device's content will be detected, not online tampering. + + Hash blocks are still verified each time they are read from the hash device, + since verification of hash blocks is less performance critical than data + blocks, and a hash block will not be verified any more after all the data + blocks it covers have been verified anyway. + +Theory of operation +=================== + +dm-verity is meant to be set up as part of a verified boot path. This +may be anything ranging from a boot using tboot or trustedgrub to just +booting from a known-good device (like a USB drive or CD). + +When a dm-verity device is configured, it is expected that the caller +has been authenticated in some way (cryptographic signatures, etc). +After instantiation, all hashes will be verified on-demand during +disk access. If they cannot be verified up to the root node of the +tree, the root hash, then the I/O will fail. This should detect +tampering with any data on the device and the hash data. + +Cryptographic hashes are used to assert the integrity of the device on a +per-block basis. This allows for a lightweight hash computation on first read +into the page cache. Block hashes are stored linearly, aligned to the nearest +block size. + +If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + +Hash Tree +--------- + +Each node in the tree is a cryptographic hash. If it is a leaf node, the hash +of some data block on disk is calculated. If it is an intermediary node, +the hash of a number of child nodes is calculated. + +Each entry in the tree is a collection of neighboring nodes that fit in one +block. The number is determined based on block_size and the size of the +selected cryptographic digest algorithm. The hashes are linearly-ordered in +this entry and any unaligned trailing space is ignored but included when +calculating the parent node. + +The tree looks something like: + + alg = sha256, num_blocks = 32768, block_size = 4096 + +:: + + [ root ] + / . . . \ + [entry_0] [entry_1] + / . . . \ . . . \ + [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] + / ... \ / . . . \ / \ + blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 + + +On-disk format +============== + +The verity kernel code does not read the verity metadata on-disk header. +It only reads the hash blocks which directly follow the header. +It is expected that a user-space tool will verify the integrity of the +verity header. + +Alternatively, the header can be omitted and the dmsetup parameters can +be passed via the kernel command-line in a rooted chain of trust where +the command-line is verified. + +Directly following the header (and with sector number padded to the next hash +block boundary) are the hash blocks which are stored a depth at a time +(starting from the root), sorted in order of increasing index. + +The full specification of kernel parameters and on-disk metadata format +is available at the cryptsetup project's wiki page + + https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity + +Status +====== +V (for Valid) is returned if every check performed so far was valid. +If any check failed, C (for Corruption) is returned. + +Example +======= +Set up a device:: + + # dmsetup create vroot --readonly --table \ + "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ + "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ + "1234000000000000000000000000000000000000000000000000000000000000" + +A command line tool veritysetup is available to compute or verify +the hash tree or activate the kernel device. This is available from +the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ +(as a libcryptsetup extension). + +Create hash on the device:: + + # veritysetup format /dev/sda1 /dev/sda2 + ... + Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 + +Activate the device:: + + # veritysetup create vroot /dev/sda1 /dev/sda2 \ + 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/admin-guide/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst new file mode 100644 index 000000000000..d3d7690f5e8d --- /dev/null +++ b/Documentation/admin-guide/device-mapper/writecache.rst @@ -0,0 +1,79 @@ +================= +Writecache target +================= + +The writecache target caches writes on persistent memory or on SSD. It +doesn't cache reads because reads are supposed to be cached in page cache +in normal RAM. + +When the device is constructed, the first sector should be zeroed or the +first sector should contain valid superblock from previous invocation. + +Constructor parameters: + +1. type of the cache device - "p" or "s" + + - p - persistent memory + - s - SSD +2. the underlying device that will be cached +3. the cache device +4. block size (4096 is recommended; the maximum block size is the page + size) +5. the number of optional parameters (the parameters with an argument + count as two) + + start_sector n (default: 0) + offset from the start of cache device in 512-byte sectors + high_watermark n (default: 50) + start writeback when the number of used blocks reach this + watermark + low_watermark x (default: 45) + stop writeback when the number of used blocks drops below + this watermark + writeback_jobs n (default: unlimited) + limit the number of blocks that are in flight during + writeback. Setting this value reduces writeback + throughput, but it may improve latency of read requests + autocommit_blocks n (default: 64 for pmem, 65536 for ssd) + when the application writes this amount of blocks without + issuing the FLUSH request, the blocks are automatically + commited + autocommit_time ms (default: 1000) + autocommit time in milliseconds. The data is automatically + commited if this time passes and no FLUSH request is + received + fua (by default on) + applicable only to persistent memory - use the FUA flag + when writing data from persistent memory back to the + underlying device + nofua + applicable only to persistent memory - don't use the FUA + flag when writing back data and send the FLUSH request + afterwards + + - some underlying devices perform better with fua, some + with nofua. The user should test it + +Status: +1. error indicator - 0 if there was no error, otherwise error number +2. the number of blocks +3. the number of free blocks +4. the number of blocks under writeback + +Messages: + flush + flush the cache device. The message returns successfully + if the cache device was flushed without an error + flush_on_suspend + flush the cache device on next suspend. Use this message + when you are going to remove the cache device. The proper + sequence for removing the cache device is: + + 1. send the "flush_on_suspend" message + 2. load an inactive table with a linear target that maps + to the underlying device + 3. suspend the device + 4. ask for status and verify that there are no errors + 5. resume the device, so that it will use the linear + target + 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/admin-guide/device-mapper/zero.rst b/Documentation/admin-guide/device-mapper/zero.rst new file mode 100644 index 000000000000..11fb5cf4597c --- /dev/null +++ b/Documentation/admin-guide/device-mapper/zero.rst @@ -0,0 +1,37 @@ +======= +dm-zero +======= + +Device-Mapper's "zero" target provides a block-device that always returns +zero'd data on reads and silently drops writes. This is similar behavior to +/dev/zero, but as a block-device instead of a character-device. + +Dm-zero has no target-specific parameters. + +One very interesting use of dm-zero is for creating "sparse" devices in +conjunction with dm-snapshot. A sparse device reports a device-size larger +than the amount of actual storage space available for that device. A user can +write data anywhere within the sparse device and read it back like a normal +device. Reads to previously unwritten areas will return a zero'd buffer. When +enough data has been written to fill up the actual storage space, the sparse +device is deactivated. This can be very useful for testing device and +filesystem limitations. + +To create a sparse device, start by creating a dm-zero device that's the +desired size of the sparse device. For this example, we'll assume a 10TB +sparse device:: + + TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors + echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 + +Then create a snapshot of the zero device, using any available block-device as +the COW device. The size of the COW device will determine the amount of real +space available to the sparse device. For this example, we'll assume /dev/sdb1 +is an available 10GB partition:: + + echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ + dmsetup create sparse1 + +This will create a 10TB sparse device called /dev/mapper/sparse1 that has +10GB of actual storage space available. If more than 10GB of data is written +to this device, it will start returning I/O errors. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index abc2c4e83939..64e97a969857 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -80,6 +80,7 @@ configure specific aspects of kernel behavior to your liking. namespaces/index perf-security acpi/index + device-mapper/index .. only:: subproject and html diff --git a/Documentation/device-mapper/cache-policies.rst b/Documentation/device-mapper/cache-policies.rst deleted file mode 100644 index b17fe352fc41..000000000000 --- a/Documentation/device-mapper/cache-policies.rst +++ /dev/null @@ -1,131 +0,0 @@ -============================= -Guidance for writing policies -============================= - -Try to keep transactionality out of it. The core is careful to -avoid asking about anything that is migrating. This is a pain, but -makes it easier to write the policies. - -Mappings are loaded into the policy at construction time. - -Every bio that is mapped by the target is referred to the policy. -The policy can return a simple HIT or MISS or issue a migration. - -Currently there's no way for the policy to issue background work, -e.g. to start writing back dirty blocks that are going to be evicted -soon. - -Because we map bios, rather than requests it's easy for the policy -to get fooled by many small bios. For this reason the core target -issues periodic ticks to the policy. It's suggested that the policy -doesn't update states (eg, hit counts) for a block more than once -for each tick. The core ticks by watching bios complete, and so -trying to see when the io scheduler has let the ios run. - - -Overview of supplied cache replacement policies -=============================================== - -multiqueue (mq) ---------------- - -This policy is now an alias for smq (see below). - -The following tunables are accepted, but have no effect:: - - 'sequential_threshold <#nr_sequential_ios>' - 'random_threshold <#nr_random_ios>' - 'read_promote_adjustment ' - 'write_promote_adjustment ' - 'discard_promote_adjustment ' - -Stochastic multiqueue (smq) ---------------------------- - -This policy is the default. - -The stochastic multi-queue (smq) policy addresses some of the problems -with the multiqueue (mq) policy. - -The smq policy (vs mq) offers the promise of less memory utilization, -improved performance and increased adaptability in the face of changing -workloads. smq also does not have any cumbersome tuning knobs. - -Users may switch from "mq" to "smq" simply by appropriately reloading a -DM table that is using the cache target. Doing so will cause all of the -mq policy's hints to be dropped. Also, performance of the cache may -degrade slightly until smq recalculates the origin device's hotspots -that should be cached. - -Memory usage -^^^^^^^^^^^^ - -The mq policy used a lot of memory; 88 bytes per cache block on a 64 -bit machine. - -smq uses 28bit indexes to implement its data structures rather than -pointers. It avoids storing an explicit hit count for each block. It -has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of -the entries (each hotspot block covers a larger area than a single -cache block). - -All this means smq uses ~25bytes per cache block. Still a lot of -memory, but a substantial improvement nontheless. - -Level balancing -^^^^^^^^^^^^^^^ - -mq placed entries in different levels of the multiqueue structures -based on their hit count (~ln(hit count)). This meant the bottom -levels generally had the most entries, and the top ones had very -few. Having unbalanced levels like this reduced the efficacy of the -multiqueue. - -smq does not maintain a hit count, instead it swaps hit entries with -the least recently used entry from the level above. The overall -ordering being a side effect of this stochastic process. With this -scheme we can decide how many entries occupy each multiqueue level, -resulting in better promotion/demotion decisions. - -Adaptability: -The mq policy maintained a hit count for each cache block. For a -different block to get promoted to the cache its hit count has to -exceed the lowest currently in the cache. This meant it could take a -long time for the cache to adapt between varying IO patterns. - -smq doesn't maintain hit counts, so a lot of this problem just goes -away. In addition it tracks performance of the hotspot queue, which -is used to decide which blocks to promote. If the hotspot queue is -performing badly then it starts moving entries more quickly between -levels. This lets it adapt to new IO patterns very quickly. - -Performance -^^^^^^^^^^^ - -Testing smq shows substantially better performance than mq. - -cleaner -------- - -The cleaner writes back all dirty blocks in a cache to decommission it. - -Examples -======== - -The syntax for a table is:: - - cache - <#feature_args> []* - <#policy_args> []* - -The syntax to send a message using the dmsetup command is:: - - dmsetup message 0 sequential_threshold 1024 - dmsetup message 0 random_threshold 8 - -Using dmsetup:: - - dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ - /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" - creates a 128GB large mapped device named 'blah' with the - sequential threshold set to 1024 and the random_threshold set to 8. diff --git a/Documentation/device-mapper/cache.rst b/Documentation/device-mapper/cache.rst deleted file mode 100644 index f15e5254d05b..000000000000 --- a/Documentation/device-mapper/cache.rst +++ /dev/null @@ -1,337 +0,0 @@ -===== -Cache -===== - -Introduction -============ - -dm-cache is a device mapper target written by Joe Thornber, Heinz -Mauelshagen, and Mike Snitzer. - -It aims to improve performance of a block device (eg, a spindle) by -dynamically migrating some of its data to a faster, smaller device -(eg, an SSD). - -This device-mapper solution allows us to insert this caching at -different levels of the dm stack, for instance above the data device for -a thin-provisioning pool. Caching solutions that are integrated more -closely with the virtual memory system should give better performance. - -The target reuses the metadata library used in the thin-provisioning -library. - -The decision as to what data to migrate and when is left to a plug-in -policy module. Several of these have been written as we experiment, -and we hope other people will contribute others for specific io -scenarios (eg. a vm image server). - -Glossary -======== - - Migration - Movement of the primary copy of a logical block from one - device to the other. - Promotion - Migration from slow device to fast device. - Demotion - Migration from fast device to slow device. - -The origin device always contains a copy of the logical block, which -may be out of date or kept in sync with the copy on the cache device -(depending on policy). - -Design -====== - -Sub-devices ------------ - -The target is constructed by passing three devices to it (along with -other parameters detailed later): - -1. An origin device - the big, slow one. - -2. A cache device - the small, fast one. - -3. A small metadata device - records which blocks are in the cache, - which are dirty, and extra hints for use by the policy object. - This information could be put on the cache device, but having it - separate allows the volume manager to configure it differently, - e.g. as a mirror for extra robustness. This metadata device may only - be used by a single cache device. - -Fixed block size ----------------- - -The origin is divided up into blocks of a fixed size. This block size -is configurable when you first create the cache. Typically we've been -using block sizes of 256KB - 1024KB. The block size must be between 64 -sectors (32KB) and 2097152 sectors (1GB) and a multiple of 64 sectors (32KB). - -Having a fixed block size simplifies the target a lot. But it is -something of a compromise. For instance, a small part of a block may be -getting hit a lot, yet the whole block will be promoted to the cache. -So large block sizes are bad because they waste cache space. And small -block sizes are bad because they increase the amount of metadata (both -in core and on disk). - -Cache operating modes ---------------------- - -The cache has three operating modes: writeback, writethrough and -passthrough. - -If writeback, the default, is selected then a write to a block that is -cached will go only to the cache and the block will be marked dirty in -the metadata. - -If writethrough is selected then a write to a cached block will not -complete until it has hit both the origin and cache devices. Clean -blocks should remain clean. - -If passthrough is selected, useful when the cache contents are not known -to be coherent with the origin device, then all reads are served from -the origin device (all reads miss the cache) and all writes are -forwarded to the origin device; additionally, write hits cause cache -block invalidates. To enable passthrough mode the cache must be clean. -Passthrough mode allows a cache device to be activated without having to -worry about coherency. Coherency that exists is maintained, although -the cache will gradually cool as writes take place. If the coherency of -the cache can later be verified, or established through use of the -"invalidate_cblocks" message, the cache device can be transitioned to -writethrough or writeback mode while still warm. Otherwise, the cache -contents can be discarded prior to transitioning to the desired -operating mode. - -A simple cleaner policy is provided, which will clean (write back) all -dirty blocks in a cache. Useful for decommissioning a cache or when -shrinking a cache. Shrinking the cache's fast device requires all cache -blocks, in the area of the cache being removed, to be clean. If the -area being removed from the cache still contains dirty blocks the resize -will fail. Care must be taken to never reduce the volume used for the -cache's fast device until the cache is clean. This is of particular -importance if writeback mode is used. Writethrough and passthrough -modes already maintain a clean cache. Future support to partially clean -the cache, above a specified threshold, will allow for keeping the cache -warm and in writeback mode during resize. - -Migration throttling --------------------- - -Migrating data between the origin and cache device uses bandwidth. -The user can set a throttle to prevent more than a certain amount of -migration occurring at any one time. Currently we're not taking any -account of normal io traffic going to the devices. More work needs -doing here to avoid migrating during those peak io moments. - -For the time being, a message "migration_threshold <#sectors>" -can be used to set the maximum number of sectors being migrated, -the default being 2048 sectors (1MB). - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the cache behaves like a physical disk that has a volatile write -cache. If power is lost you may lose some recent writes. The metadata -should always be consistent in spite of any crash. - -The 'dirty' state for a cache block changes far too frequently for us -to keep updating it on the fly. So we treat it as a hint. In normal -operation it will be written when the dm device is suspended. If the -system crashes all cache blocks will be assumed dirty when restarted. - -Per-block policy hints ----------------------- - -Policy plug-ins can store a chunk of data per cache block. It's up to -the policy how big this chunk is, but it should be kept small. Like the -dirty flags this data is lost if there's a crash so a safe fallback -value should always be possible. - -Policy hints affect performance, not correctness. - -Policy messaging ----------------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. Refer to cache-policies.txt. - -Discard bitset resolution -------------------------- - -We can avoid copying data during migration if we know the block has -been discarded. A prime example of this is when mkfs discards the -whole block device. We store a bitset tracking the discard state of -blocks. However, we allow this bitset to have a different block size -from the cache blocks. This is because we need to track the discard -state for all of the origin device (compare with the dirty bitset -which is just for the smaller cache device). - -Target interface -================ - -Constructor ------------ - - :: - - cache - <#feature args> []* - <#policy args> [policy args]* - - ================ ======================================================= - metadata dev fast device holding the persistent metadata - cache dev fast device holding cached data blocks - origin dev slow device holding original data blocks - block size cache unit size in sectors - - #feature args number of feature arguments passed - feature args writethrough or passthrough (The default is writeback.) - - policy the replacement policy to use - #policy args an even number of arguments corresponding to - key/value pairs passed to the policy - policy args key/value pairs passed to the policy - E.g. 'sequential_threshold 1024' - See cache-policies.txt for details. - ================ ======================================================= - -Optional feature arguments are: - - - ==================== ======================================================== - writethrough write through caching that prohibits cache block - content from being different from origin block content. - Without this argument, the default behaviour is to write - back cache block contents later for performance reasons, - so they may differ from the corresponding origin blocks. - - passthrough a degraded mode useful for various cache coherency - situations (e.g., rolling back snapshots of - underlying storage). Reads and writes always go to - the origin. If a write goes to a cached origin - block, then the cache block is invalidated. - To enable passthrough mode the cache must be clean. - - metadata2 use version 2 of the metadata. This stores the dirty - bits in a separate btree, which improves speed of - shutting down the cache. - - no_discard_passdown disable passing down discards from the cache - to the origin's data device. - ==================== ======================================================== - -A policy called 'default' is always registered. This is an alias for -the policy we currently think is giving best all round performance. - -As the default policy could vary between kernels, if you are relying on -the characteristics of a specific policy, always request it by name. - -Status ------- - -:: - - <#used metadata blocks>/<#total metadata blocks> - <#used cache blocks>/<#total cache blocks> - <#read hits> <#read misses> <#write hits> <#write misses> - <#demotions> <#promotions> <#dirty> <#features> * - <#core args> * <#policy args> * - - - -========================= ===================================================== -metadata block size Fixed block size for each metadata block in - sectors -#used metadata blocks Number of metadata blocks used -#total metadata blocks Total number of metadata blocks -cache block size Configurable block size for the cache device - in sectors -#used cache blocks Number of blocks resident in the cache -#total cache blocks Total number of cache blocks -#read hits Number of times a READ bio has been mapped - to the cache -#read misses Number of times a READ bio has been mapped - to the origin -#write hits Number of times a WRITE bio has been mapped - to the cache -#write misses Number of times a WRITE bio has been - mapped to the origin -#demotions Number of times a block has been removed - from the cache -#promotions Number of times a block has been moved to - the cache -#dirty Number of blocks in the cache that differ - from the origin -#feature args Number of feature args to follow -feature args 'writethrough' (optional) -#core args Number of core arguments (must be even) -core args Key/value pairs for tuning the core - e.g. migration_threshold -policy name Name of the policy -#policy args Number of policy arguments to follow (must be even) -policy args Key/value pairs e.g. sequential_threshold -cache metadata mode ro if read-only, rw if read-write - - In serious cases where even a read-only mode is - deemed unsafe no further I/O will be permitted and - the status will just contain the string 'Fail'. - The userspace recovery tools should then be used. -needs_check 'needs_check' if set, '-' if not set - A metadata operation has failed, resulting in the - needs_check flag being set in the metadata's - superblock. The metadata device must be - deactivated and checked/repaired before the - cache can be made fully operational again. - '-' indicates needs_check is not set. -========================= ===================================================== - -Messages --------- - -Policies will have different tunables, specific to each one, so we -need a generic way of getting and setting these. Device-mapper -messages are used. (A sysfs interface would also be possible.) - -The message format is:: - - - -E.g.:: - - dmsetup message my_cache 0 sequential_threshold 1024 - - -Invalidation is removing an entry from the cache without writing it -back. Cache blocks can be invalidated via the invalidate_cblocks -message, which takes an arbitrary number of cblock ranges. Each cblock -range's end value is "one past the end", meaning 5-10 expresses a range -of values from 5 to 9. Each cblock must be expressed as a decimal -value, in the future a variant message that takes cblock ranges -expressed in hexadecimal may be needed to better support efficient -invalidation of larger caches. The cache must be in passthrough mode -when invalidate_cblocks is used:: - - invalidate_cblocks [|-]* - -E.g.:: - - dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 - -Examples -======== - -The test suite can be found here: - -https://github.com/jthornber/device-mapper-test-suite - -:: - - dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' - dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ - mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/device-mapper/delay.rst b/Documentation/device-mapper/delay.rst deleted file mode 100644 index 917ba8c33359..000000000000 --- a/Documentation/device-mapper/delay.rst +++ /dev/null @@ -1,31 +0,0 @@ -======== -dm-delay -======== - -Device-Mapper's "delay" target delays reads and/or writes -and maps them to different devices. - -Parameters:: - - [ - [ ]] - -With separate write parameters, the first set is only used for reads. -Offsets are specified in sectors. -Delays are specified in milliseconds. - -Example scripts -=============== - -:: - - #!/bin/sh - # Create device delaying rw operation for 500ms - echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed - -:: - - #!/bin/sh - # Create device delaying only write operation for 500ms and - # splitting reads and writes to different devices $1 $2 - echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed diff --git a/Documentation/device-mapper/dm-crypt.rst b/Documentation/device-mapper/dm-crypt.rst deleted file mode 100644 index 8f4a3f889d43..000000000000 --- a/Documentation/device-mapper/dm-crypt.rst +++ /dev/null @@ -1,173 +0,0 @@ -======== -dm-crypt -======== - -Device-Mapper's "crypt" target provides transparent encryption of block devices -using the kernel crypto API. - -For a more detailed description of supported parameters see: -https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt - -Parameters:: - - \ - [<#opt_params> ] - - - Encryption cipher, encryption mode and Initial Vector (IV) generator. - - The cipher specifications format is:: - - cipher[:keycount]-chainmode-ivmode[:ivopts] - - Examples:: - - aes-cbc-essiv:sha256 - aes-xts-plain64 - serpent-xts-plain64 - - Cipher format also supports direct specification with kernel crypt API - format (selected by capi: prefix). The IV specification is the same - as for the first format type. - This format is mainly used for specification of authenticated modes. - - The crypto API cipher specifications format is:: - - capi:cipher_api_spec-ivmode[:ivopts] - - Examples:: - - capi:cbc(aes)-essiv:sha256 - capi:xts(aes)-plain64 - - Examples of authenticated modes:: - - capi:gcm(aes)-random - capi:authenc(hmac(sha256),xts(aes))-random - capi:rfc7539(chacha20,poly1305)-random - - The /proc/crypto contains a list of curently loaded crypto modes. - - - Key used for encryption. It is encoded either as a hexadecimal number - or it can be passed as prefixed with single colon - character (':') for keys residing in kernel keyring service. - You can only use key sizes that are valid for the selected cipher - in combination with the selected iv mode. - Note that for some iv modes the key string can contain additional - keys (for example IV seed) so the key contains more parts concatenated - into a single string. - - - The kernel keyring key is identified by string in following format: - ::. - - - The encryption key size in bytes. The kernel key payload size must match - the value passed in . - - - Either 'logon' or 'user' kernel key type. - - - The kernel keyring key description crypt target should look for - when loading key of . - - - Multi-key compatibility mode. You can define keys and - then sectors are encrypted according to their offsets (sector 0 uses key0; - sector 1 uses key1 etc.). must be a power of two. - - - The IV offset is a sector count that is added to the sector number - before creating the IV. - - - This is the device that is going to be used as backend and contains the - encrypted data. You can specify it as a path like /dev/xxx or a device - number :. - - - Starting sector within the device where the encrypted data begins. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional paramaters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 3 allow_discards same_cpu_crypt submit_from_crypt_cpus - -allow_discards - Block discard requests (a.k.a. TRIM) are passed through the crypt device. - The default is to ignore discard requests. - - WARNING: Assess the specific security risks carefully before enabling this - option. For example, allowing discards on encrypted devices may lead to - the leak of information about the ciphertext device (filesystem type, - used space etc.) if the discarded blocks can be located easily on the - device later. - -same_cpu_crypt - Perform encryption using the same cpu that IO was submitted on. - The default is to use an unbound workqueue so that encryption work - is automatically balanced between available CPUs. - -submit_from_crypt_cpus - Disable offloading writes to a separate thread after encryption. - There are some situations where offloading write bios from the - encryption threads to a single thread degrades performance - significantly. The default is to offload write bios to the same - thread because it benefits CFQ to have writes submitted using the - same context. - -integrity:: - The device requires additional metadata per-sector stored - in per-bio integrity structure. This metadata must by provided - by underlying dm-integrity target. - - The can be "none" if metadata is used only for persistent IV. - - For Authenticated Encryption with Additional Data (AEAD) - the is "aead". An AEAD mode additionally calculates and verifies - integrity for the encrypted device. The additional space is then - used for storing authentication tag (and persistent IV if needed). - -sector_size: - Use as the encryption unit instead of 512 bytes sectors. - This option can be in range 512 - 4096 bytes and must be power of two. - Virtual device will announce this size as a minimal IO and logical sector. - -iv_large_sectors - IV generators will use sector number counted in units - instead of default 512 bytes sectors. - - For example, if is 4096 bytes, plain64 IV for the second - sector will be 8 (without flag) and 1 if iv_large_sectors is present. - The must be multiple of (in 512 bytes units) - if this flag is specified. - -Example scripts -=============== -LUKS (Linux Unified Key Setup) is now the preferred way to set up disk -encryption with dm-crypt using the 'cryptsetup' utility, see -https://gitlab.com/cryptsetup/cryptsetup - -:: - - #!/bin/sh - # Create a crypt device using dmsetup - dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" - -:: - - #!/bin/sh - # Create a crypt device using dmsetup when encryption key is stored in keyring service - dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" - -:: - - #!/bin/sh - # Create a crypt device using cryptsetup and LUKS header with default cipher - cryptsetup luksFormat $1 - cryptsetup luksOpen $1 crypt1 diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/device-mapper/dm-dust.txt deleted file mode 100644 index 954d402a1f6a..000000000000 --- a/Documentation/device-mapper/dm-dust.txt +++ /dev/null @@ -1,272 +0,0 @@ -dm-dust -======= - -This target emulates the behavior of bad sectors at arbitrary -locations, and the ability to enable the emulation of the failures -at an arbitrary time. - -This target behaves similarly to a linear target. At a given time, -the user can send a message to the target to start failing read -requests on specific blocks (to emulate the behavior of a hard disk -drive with bad sectors). - -When the failure behavior is enabled (i.e.: when the output of -"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks -in the "bad block list" will fail with EIO ("Input/output error"). - -Writes of blocks in the "bad block list will result in the following: - -1. Remove the block from the "bad block list". -2. Successfully complete the write. - -This emulates the "remapped sector" behavior of a drive with bad -sectors. - -Normally, a drive that is encountering bad sectors will most likely -encounter more bad sectors, at an unknown time or location. -With dm-dust, the user can use the "addbadblock" and "removebadblock" -messages to add arbitrary bad blocks at new locations, and the -"enable" and "disable" messages to modulate the state of whether the -configured "bad blocks" will be treated as bad, or bypassed. -This allows the pre-writing of test data and metadata prior to -simulating a "failure" event where bad sectors start to appear. - -Table parameters: ------------------ - - -Mandatory parameters: - : path to the block device. - : offset to data area from start of device_path - : block size in bytes - (minimum 512, maximum 1073741824, must be a power of 2) - -Usage instructions: -------------------- - -First, find the size (in 512-byte sectors) of the device to be used: - -$ sudo blockdev --getsz /dev/vdb1 -33552384 - -Create the dm-dust device: -(For a device with a block size of 512 bytes) -$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512' - -(For a device with a block size of 4096 bytes) -$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' - -Check the status of the read behavior ("bypass" indicates that all I/O -will be passed through to the underlying device): -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 bypass - -$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct -128+0 records in -128+0 records out - -$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct -128+0 records in -128+0 records out - -Adding and removing bad blocks: -------------------------------- - -At any time (i.e.: whether the device has the "bad block" emulation -enabled or disabled), bad blocks may be added or removed from the -device via the "addbadblock" and "removebadblock" messages: - -$ sudo dmsetup message dust1 0 addbadblock 60 -kernel: device-mapper: dust: badblock added at block 60 - -$ sudo dmsetup message dust1 0 addbadblock 67 -kernel: device-mapper: dust: badblock added at block 67 - -$ sudo dmsetup message dust1 0 addbadblock 72 -kernel: device-mapper: dust: badblock added at block 72 - -These bad blocks will be stored in the "bad block list". -While the device is in "bypass" mode, reads and writes will succeed: - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 bypass - -Enabling block read failures: ------------------------------ - -To enable the "fail read on bad block" behavior, send the "enable" message: - -$ sudo dmsetup message dust1 0 enable -kernel: device-mapper: dust: enabling read failures on bad sectors - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block - -With the device in "fail read on bad block" mode, attempting to read a -block will encounter an "Input/output error": - -$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct -dd: error reading '/dev/mapper/dust1': Input/output error -0+0 records in -0+0 records out -0 bytes copied, 0.00040651 s, 0.0 kB/s - -...and writing to the bad blocks will remove the blocks from the list, -therefore emulating the "remap" behavior of hard disk drives: - -$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct -128+0 records in -128+0 records out - -kernel: device-mapper: dust: block 60 removed from badblocklist by write -kernel: device-mapper: dust: block 67 removed from badblocklist by write -kernel: device-mapper: dust: block 72 removed from badblocklist by write -kernel: device-mapper: dust: block 87 removed from badblocklist by write - -Bad block add/remove error handling: ------------------------------------- - -Attempting to add a bad block that already exists in the list will -result in an "Invalid argument" error, as well as a helpful message: - -$ sudo dmsetup message dust1 0 addbadblock 88 -device-mapper: message ioctl on dust1 failed: Invalid argument -kernel: device-mapper: dust: block 88 already in badblocklist - -Attempting to remove a bad block that doesn't exist in the list will -result in an "Invalid argument" error, as well as a helpful message: - -$ sudo dmsetup message dust1 0 removebadblock 87 -device-mapper: message ioctl on dust1 failed: Invalid argument -kernel: device-mapper: dust: block 87 not found in badblocklist - -Counting the number of bad blocks in the bad block list: --------------------------------------------------------- - -To count the number of bad blocks configured in the device, run the -following message command: - -$ sudo dmsetup message dust1 0 countbadblocks - -A message will print with the number of bad blocks currently -configured on the device: - -kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found - -Querying for specific bad blocks: ---------------------------------- - -To find out if a specific block is in the bad block list, run the -following message command: - -$ sudo dmsetup message dust1 0 queryblock 72 - -The following message will print if the block is in the list: -device-mapper: dust: queryblock: block 72 found in badblocklist - -The following message will print if the block is in the list: -device-mapper: dust: queryblock: block 72 not found in badblocklist - -The "queryblock" message command will work in both the "enabled" -and "disabled" modes, allowing the verification of whether a block -will be treated as "bad" without having to issue I/O to the device, -or having to "enable" the bad block emulation. - -Clearing the bad block list: ----------------------------- - -To clear the bad block list (without needing to individually run -a "removebadblock" message command for every block), run the -following message command: - -$ sudo dmsetup message dust1 0 clearbadblocks - -After clearing the bad block list, the following message will appear: - -kernel: device-mapper: dust: clearbadblocks: badblocks cleared - -If there were no bad blocks to clear, the following message will -appear: - -kernel: device-mapper: dust: clearbadblocks: no badblocks found - -Message commands list: ----------------------- - -Below is a list of the messages that can be sent to a dust device: - -Operations on blocks (requires a argument): - -addbadblock -queryblock -removebadblock - -...where is a block number within range of the device - (corresponding to the block size of the device.) - -Single argument message commands: - -countbadblocks -clearbadblocks -disable -enable -quiet - -Device removal: ---------------- - -When finished, remove the device via the "dmsetup remove" command: - -$ sudo dmsetup remove dust1 - -Quiet mode: ------------ - -On test runs with many bad blocks, it may be desirable to avoid -excessive logging (from bad blocks added, removed, or "remapped"). -This can be done by enabling "quiet mode" via the following message: - -$ sudo dmsetup message dust1 0 quiet - -This will suppress log messages from add / remove / removed by write -operations. Log messages from "countbadblocks" or "queryblock" -message commands will still print in quiet mode. - -The status of quiet mode can be seen by running "dmsetup status": - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block quiet - -To disable quiet mode, send the "quiet" message again: - -$ sudo dmsetup message dust1 0 quiet - -$ sudo dmsetup status dust1 -0 33552384 dust 252:17 fail_read_on_bad_block verbose - -(The presence of "verbose" indicates normal logging.) - -"Why not...?" -------------- - -scsi_debug has a "medium error" mode that can fail reads on one -specified sector (sector 0x1234, hardcoded in the source code), but -it uses RAM for the persistent storage, which drastically decreases -the potential device size. - -dm-flakey fails all I/O from all block locations at a specified time -frequency, and not a given point in time. - -When a bad sector occurs on a hard disk drive, reads to that sector -are failed by the device, usually resulting in an error code of EIO -("I/O error") or ENODATA ("No data available"). However, a write to -the sector may succeed, and result in the sector becoming readable -after the device controller no longer experiences errors reading the -sector (or after a reallocation of the sector). However, there may -be bad sectors that occur on the device in the future, in a different, -unpredictable location. - -This target seeks to provide a device that can exhibit the behavior -of a bad sector at a known sector location, at a known time, based -on a large storage device (at least tens of gigabytes, not occupying -system memory). diff --git a/Documentation/device-mapper/dm-flakey.rst b/Documentation/device-mapper/dm-flakey.rst deleted file mode 100644 index 86138735879d..000000000000 --- a/Documentation/device-mapper/dm-flakey.rst +++ /dev/null @@ -1,74 +0,0 @@ -========= -dm-flakey -========= - -This target is the same as the linear target except that it exhibits -unreliable behaviour periodically. It's been found useful in simulating -failing devices for testing purposes. - -Starting from the time the table is loaded, the device is available for - seconds, then exhibits unreliable behaviour for seconds, and then this cycle repeats. - -Also, consider using this in combination with the dm-delay target too, -which can delay reads and writes and/or send them to different -underlying devices. - -Table parameters ----------------- - -:: - - \ - [ []] - -Mandatory parameters: - - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - : - Number of seconds device is available. - : - Number of seconds device returns errors. - -Optional feature parameters: - - If no feature parameters are present, during the periods of - unreliability, all I/O returns errors. - - drop_writes: - All write I/O is silently ignored. - Read I/O is handled correctly. - - error_writes: - All write I/O is failed with an error signalled. - Read I/O is handled correctly. - - corrupt_bio_byte : - During , replace of the data of - each matching bio with . - - : - The offset of the byte to replace. - Counting starts at 1, to replace the first byte. - : - Either 'r' to corrupt reads or 'w' to corrupt writes. - 'w' is incompatible with drop_writes. - : - The value (from 0-255) to write. - : - Perform the replacement only if bio->bi_opf has all the - selected flags set. - -Examples: - -Replaces the 32nd byte of READ bios with the value 1:: - - corrupt_bio_byte 32 r 1 0 - -Replaces the 224th byte of REQ_META (=32) bios with the value 0:: - - corrupt_bio_byte 224 w 0 32 diff --git a/Documentation/device-mapper/dm-init.rst b/Documentation/device-mapper/dm-init.rst deleted file mode 100644 index e5242ff17e9b..000000000000 --- a/Documentation/device-mapper/dm-init.rst +++ /dev/null @@ -1,125 +0,0 @@ -================================ -Early creation of mapped devices -================================ - -It is possible to configure a device-mapper device to act as the root device for -your system in two ways. - -The first is to build an initial ramdisk which boots to a minimal userspace -which configures the device, then pivot_root(8) in to it. - -The second is to create one or more device-mappers using the module parameter -"dm-mod.create=" through the kernel boot command line argument. - -The format is specified as a string of data separated by commas and optionally -semi-colons, where: - - - a comma is used to separate fields like name, uuid, flags and table - (specifies one device) - - a semi-colon is used to separate devices. - -So the format will look like this:: - - dm-mod.create=,,,,
[,
+][;,,,,
[,
+]+] - -Where:: - - ::= The device name. - ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" - ::= The device minor number | "" - ::= "ro" | "rw" -
::= - ::= "verity" | "linear" | ... (see list below) - -The dm line should be equivalent to the one used by the dmsetup tool with the -`--concise` argument. - -Target types -============ - -Not all target types are available as there are serious risks in allowing -activation of certain DM targets without first using userspace tools to check -the validity of associated metadata. - -======================= ======================================================= -`cache` constrained, userspace should verify cache device -`crypt` allowed -`delay` allowed -`era` constrained, userspace should verify metadata device -`flakey` constrained, meant for test -`linear` allowed -`log-writes` constrained, userspace should verify metadata device -`mirror` constrained, userspace should verify main/mirror device -`raid` constrained, userspace should verify metadata device -`snapshot` constrained, userspace should verify src/dst device -`snapshot-origin` allowed -`snapshot-merge` constrained, userspace should verify src/dst device -`striped` allowed -`switch` constrained, userspace should verify dev path -`thin` constrained, requires dm target message from userspace -`thin-pool` constrained, requires dm target message from userspace -`verity` allowed -`writecache` constrained, userspace should verify cache device -`zero` constrained, not meant for rootfs -======================= ======================================================= - -If the target is not listed above, it is constrained by default (not tested). - -Examples -======== -An example of booting to a linear array made up of user-mode linux block -devices:: - - dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 - -This will boot to a rw dm-linear target of 8192 sectors split across two block -devices identified by their major:minor numbers. After boot, udev will rename -this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. - -An example of multiple device-mappers, with the dm-mod.create="..." contents -is shown here split on multiple lines for readability:: - - dm-linear,,1,rw, - 0 32768 linear 8:1 0, - 32768 1024000 linear 8:2 0; - dm-verity,,3,ro, - 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 - ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 - 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 - -Other examples (per target): - -"crypt":: - - dm-crypt,,8,ro, - 0 1048576 crypt aes-xts-plain64 - babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 - /dev/sda 0 1 allow_discards - -"delay":: - - dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 - -"linear":: - - dm-linear,,,rw, - 0 32768 linear /dev/sda1 0, - 32768 1024000 linear /dev/sda2 0, - 1056768 204800 linear /dev/sda3 0, - 1261568 512000 linear /dev/sda4 0 - -"snapshot-origin":: - - dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 - -"striped":: - - dm-striped,,4,ro,0 1638400 striped 4 4096 - /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 - -"verity":: - - dm-verity,,4,ro, - 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 - fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd - 51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584 diff --git a/Documentation/device-mapper/dm-integrity.rst b/Documentation/device-mapper/dm-integrity.rst deleted file mode 100644 index a30aa91b5fbe..000000000000 --- a/Documentation/device-mapper/dm-integrity.rst +++ /dev/null @@ -1,259 +0,0 @@ -============ -dm-integrity -============ - -The dm-integrity target emulates a block device that has additional -per-sector tags that can be used for storing integrity information. - -A general problem with storing integrity tags with every sector is that -writing the sector and the integrity tag must be atomic - i.e. in case of -crash, either both sector and integrity tag or none of them is written. - -To guarantee write atomicity, the dm-integrity target uses journal, it -writes sector data and integrity tags into a journal, commits the journal -and then copies the data and integrity tags to their respective location. - -The dm-integrity target can be used with the dm-crypt target - in this -situation the dm-crypt target creates the integrity data and passes them -to the dm-integrity target via bio_integrity_payload attached to the bio. -In this mode, the dm-crypt and dm-integrity targets provide authenticated -disk encryption - if the attacker modifies the encrypted device, an I/O -error is returned instead of random data. - -The dm-integrity target can also be used as a standalone target, in this -mode it calculates and verifies the integrity tag internally. In this -mode, the dm-integrity target can be used to detect silent data -corruption on the disk or in the I/O path. - -There's an alternate mode of operation where dm-integrity uses bitmap -instead of a journal. If a bit in the bitmap is 1, the corresponding -region's data and integrity tags are not synchronized - if the machine -crashes, the unsynchronized regions will be recalculated. The bitmap mode -is faster than the journal mode, because we don't have to write the data -twice, but it is also less reliable, because if data corruption happens -when the machine crashes, it may not be detected. - -When loading the target for the first time, the kernel driver will format -the device. But it will only format the device if the superblock contains -zeroes. If the superblock is neither valid nor zeroed, the dm-integrity -target can't be loaded. - -To use the target for the first time: - -1. overwrite the superblock with zeroes -2. load the dm-integrity target with one-sector size, the kernel driver - will format the device -3. unload the dm-integrity target -4. read the "provided_data_sectors" value from the superblock -5. load the dm-integrity target with the the target size - "provided_data_sectors" -6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target - with the size "provided_data_sectors" - - -Target arguments: - -1. the underlying block device - -2. the number of reserved sector at the beginning of the device - the - dm-integrity won't read of write these sectors - -3. the size of the integrity tag (if "-" is used, the size is taken from - the internal-hash algorithm) - -4. mode: - - D - direct writes (without journal) - in this mode, journaling is - not used and data sectors and integrity tags are written - separately. In case of crash, it is possible that the data - and integrity tag doesn't match. - J - journaled writes - data and integrity tags are written to the - journal and atomicity is guaranteed. In case of crash, - either both data and tag or none of them are written. The - journaled mode degrades write throughput twice because the - data have to be written twice. - B - bitmap mode - data and metadata are written without any - synchronization, the driver maintains a bitmap of dirty - regions where data and metadata don't match. This mode can - only be used with internal hash. - R - recovery mode - in this mode, journal is not replayed, - checksums are not checked and writes to the device are not - allowed. This mode is useful for data recovery if the - device cannot be activated in any of the other standard - modes. - -5. the number of additional arguments - -Additional arguments: - -journal_sectors:number - The size of journal, this argument is used only if formatting the - device. If the device is already formatted, the value from the - superblock is used. - -interleave_sectors:number - The number of interleaved sectors. This values is rounded down to - a power of two. If the device is already formatted, the value from - the superblock is used. - -meta_device:device - Don't interleave the data and metadata on on device. Use a - separate device for metadata. - -buffer_sectors:number - The number of sectors in one buffer. The value is rounded down to - a power of two. - - The tag area is accessed using buffers, the buffer size is - configurable. The large buffer size means that the I/O size will - be larger, but there could be less I/Os issued. - -journal_watermark:number - The journal watermark in percents. When the size of the journal - exceeds this watermark, the thread that flushes the journal will - be started. - -commit_time:number - Commit time in milliseconds. When this time passes, the journal is - written. The journal is also written immediatelly if the FLUSH - request is received. - -internal_hash:algorithm(:key) (the key is optional) - Use internal hash or crc. - When this argument is used, the dm-integrity target won't accept - integrity tags from the upper target, but it will automatically - generate and verify the integrity tags. - - You can use a crc algorithm (such as crc32), then integrity target - will protect the data against accidental corruption. - You can also use a hmac algorithm (for example - "hmac(sha256):0123456789abcdef"), in this mode it will provide - cryptographic authentication of the data without encryption. - - When this argument is not used, the integrity tags are accepted - from an upper layer target, such as dm-crypt. The upper layer - target should check the validity of the integrity tags. - -recalculate - Recalculate the integrity tags automatically. It is only valid - when using internal hash. - -journal_crypt:algorithm(:key) (the key is optional) - Encrypt the journal using given algorithm to make sure that the - attacker can't read the journal. You can use a block cipher here - (such as "cbc(aes)") or a stream cipher (for example "chacha20", - "salsa20", "ctr(aes)" or "ecb(arc4)"). - - The journal contains history of last writes to the block device, - an attacker reading the journal could see the last sector nubmers - that were written. From the sector numbers, the attacker can infer - the size of files that were written. To protect against this - situation, you can encrypt the journal. - -journal_mac:algorithm(:key) (the key is optional) - Protect sector numbers in the journal from accidental or malicious - modification. To protect against accidental modification, use a - crc algorithm, to protect against malicious modification, use a - hmac algorithm with a key. - - This option is not needed when using internal-hash because in this - mode, the integrity of journal entries is checked when replaying - the journal. Thus, modified sector number would be detected at - this stage. - -block_size:number - The size of a data block in bytes. The larger the block size the - less overhead there is for per-block integrity metadata. - Supported values are 512, 1024, 2048 and 4096 bytes. If not - specified the default block size is 512 bytes. - -sectors_per_bit:number - In the bitmap mode, this parameter specifies the number of - 512-byte sectors that corresponds to one bitmap bit. - -bitmap_flush_interval:number - The bitmap flush interval in milliseconds. The metadata buffers - are synchronized when this interval expires. - - -The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can -be changed when reloading the target (load an inactive table and swap the -tables with suspend and resume). The other arguments should not be changed -when reloading the target because the layout of disk data depend on them -and the reloaded target would be non-functional. - - -The layout of the formatted block device: - -* reserved sectors - (they are not used by this target, they can be used for - storing LUKS metadata or for other purpose), the size of the reserved - area is specified in the target arguments - -* superblock (4kiB) - * magic string - identifies that the device was formatted - * version - * log2(interleave sectors) - * integrity tag size - * the number of journal sections - * provided data sectors - the number of sectors that this target - provides (i.e. the size of the device minus the size of all - metadata and padding). The user of this target should not send - bios that access data beyond the "provided data sectors" limit. - * flags - SB_FLAG_HAVE_JOURNAL_MAC - - a flag is set if journal_mac is used - SB_FLAG_RECALCULATING - - recalculating is in progress - SB_FLAG_DIRTY_BITMAP - - journal area contains the bitmap of dirty - blocks - * log2(sectors per block) - * a position where recalculating finished -* journal - The journal is divided into sections, each section contains: - - * metadata area (4kiB), it contains journal entries - - - every journal entry contains: - - * logical sector (specifies where the data and tag should - be written) - * last 8 bytes of data - * integrity tag (the size is specified in the superblock) - - - every metadata sector ends with - - * mac (8-bytes), all the macs in 8 metadata sectors form a - 64-byte value. It is used to store hmac of sector - numbers in the journal section, to protect against a - possibility that the attacker tampers with sector - numbers in the journal. - * commit id - - * data area (the size is variable; it depends on how many journal - entries fit into the metadata area) - - - every sector in the data area contains: - - * data (504 bytes of data, the last 8 bytes are stored in - the journal entry) - * commit id - - To test if the whole journal section was written correctly, every - 512-byte sector of the journal ends with 8-byte commit id. If the - commit id matches on all sectors in a journal section, then it is - assumed that the section was written correctly. If the commit id - doesn't match, the section was written partially and it should not - be replayed. - -* one or more runs of interleaved tags and data. - Each run contains: - - * tag area - it contains integrity tags. There is one tag for each - sector in the data area - * data area - it contains data sectors. The number of data sectors - in one run must be a power of two. log2 of this value is stored - in the superblock. diff --git a/Documentation/device-mapper/dm-io.rst b/Documentation/device-mapper/dm-io.rst deleted file mode 100644 index d2492917a1f5..000000000000 --- a/Documentation/device-mapper/dm-io.rst +++ /dev/null @@ -1,75 +0,0 @@ -===== -dm-io -===== - -Dm-io provides synchronous and asynchronous I/O services. There are three -types of I/O services available, and each type has a sync and an async -version. - -The user must set up an io_region structure to describe the desired location -of the I/O. Each io_region indicates a block-device along with the starting -sector and size of the region:: - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -Dm-io can read from one io_region or write to one or more io_regions. Writes -to multiple regions are specified by an array of io_region structures. - -The first I/O service type takes a list of memory pages as the data buffer for -the I/O, along with an offset into the first page:: - - struct page_list { - struct page_list *next; - struct page *page; - }; - - int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - unsigned long *error_bits); - int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - io_notify_fn fn, void *context); - -The second I/O service type takes an array of bio vectors as the data buffer -for the I/O. This service can be handy if the caller has a pre-assembled bio, -but wants to direct different portions of the bio to different devices:: - - int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - unsigned long *error_bits); - int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, - int rw, struct bio_vec *bvec, - io_notify_fn fn, void *context); - -The third I/O service type takes a pointer to a vmalloc'd memory buffer as the -data buffer for the I/O. This service can be handy if the caller needs to do -I/O to a large region but doesn't want to allocate a large number of individual -memory pages:: - - int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, unsigned long *error_bits); - int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, io_notify_fn fn, void *context); - -Callers of the asynchronous I/O services must include the name of a completion -callback routine and a pointer to some context data for the I/O:: - - typedef void (*io_notify_fn)(unsigned long error, void *context); - -The "error" parameter in this callback, as well as the `*error` parameter in -all of the synchronous versions, is a bitset (instead of a simple error value). -In the case of an write-I/O to multiple regions, this bitset allows dm-io to -indicate success or failure on each individual region. - -Before using any of the dm-io services, the user should call dm_io_get() -and specify the number of pages they expect to perform I/O on concurrently. -Dm-io will attempt to resize its mempool to make sure enough pages are -always available in order to avoid unnecessary waiting while performing I/O. - -When the user is finished using the dm-io services, they should call -dm_io_put() and specify the same number of pages that were given on the -dm_io_get() call. diff --git a/Documentation/device-mapper/dm-log.rst b/Documentation/device-mapper/dm-log.rst deleted file mode 100644 index ba4fce39bc27..000000000000 --- a/Documentation/device-mapper/dm-log.rst +++ /dev/null @@ -1,57 +0,0 @@ -===================== -Device-Mapper Logging -===================== -The device-mapper logging code is used by some of the device-mapper -RAID targets to track regions of the disk that are not consistent. -A region (or portion of the address space) of the disk may be -inconsistent because a RAID stripe is currently being operated on or -a machine died while the region was being altered. In the case of -mirrors, a region would be considered dirty/inconsistent while you -are writing to it because the writes need to be replicated for all -the legs of the mirror and may not reach the legs at the same time. -Once all writes are complete, the region is considered clean again. - -There is a generic logging interface that the device-mapper RAID -implementations use to perform logging operations (see -dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different -logging implementations are available and provide different -capabilities. The list includes: - -============== ============================================================== -Type Files -============== ============================================================== -disk drivers/md/dm-log.c -core drivers/md/dm-log.c -userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h -============== ============================================================== - -The "disk" log type -------------------- -This log implementation commits the log state to disk. This way, the -logging state survives reboots/crashes. - -The "core" log type -------------------- -This log implementation keeps the log state in memory. The log state -will not survive a reboot or crash, but there may be a small boost in -performance. This method can also be used if no storage device is -available for storing log state. - -The "userspace" log type ------------------------- -This log type simply provides a way to export the log API to userspace, -so log implementations can be done there. This is done by forwarding most -logging requests to userspace, where a daemon receives and processes the -request. - -The structure used for communication between kernel and userspace are -located in include/linux/dm-log-userspace.h. Due to the frequency, -diversity, and 2-way communication nature of the exchanges between -kernel and userspace, 'connector' is used as the interface for -communication. - -There are currently two userspace log implementations that leverage this -framework - "clustered-disk" and "clustered-core". These implementations -provide a cluster-coherent log for shared-storage. Device-mapper mirroring -can be used in a shared-storage environment when the cluster log implementations -are employed. diff --git a/Documentation/device-mapper/dm-queue-length.rst b/Documentation/device-mapper/dm-queue-length.rst deleted file mode 100644 index d8e381c1cb02..000000000000 --- a/Documentation/device-mapper/dm-queue-length.rst +++ /dev/null @@ -1,48 +0,0 @@ -=============== -dm-queue-length -=============== - -dm-queue-length is a path selector module for device-mapper targets, -which selects a path with the least number of in-flight I/Os. -The path selector name is 'queue-length'. - -Table parameters for each path: [] - -:: - - : The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - -Status for each path: - -:: - - : 'A' if the path is active, 'F' if the path is failed. - : The number of path failures. - : The number of in-flight I/Os on the path. - - -Algorithm -========= - -dm-queue-length increments/decrements 'in-flight' when an I/O is -dispatched/completed respectively. -dm-queue-length selects a path with the minimum 'in-flight'. - - -Examples -======== -In case that 2 paths (sda and sdb) are used with repeat_count == 128. - -:: - - # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/device-mapper/dm-raid.rst b/Documentation/device-mapper/dm-raid.rst deleted file mode 100644 index 2fe255b130fb..000000000000 --- a/Documentation/device-mapper/dm-raid.rst +++ /dev/null @@ -1,419 +0,0 @@ -======= -dm-raid -======= - -The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. -It allows the MD RAID drivers to be accessed using a device-mapper -interface. - - -Mapping Table Interface ------------------------ -The target is named "raid" and it accepts the following parameters:: - - <#raid_params> \ - <#raid_devs> [.. ] - -: - - ============= =============================================================== - raid0 RAID0 striping (no resilience) - raid1 RAID1 mirroring - raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk supporting takeover - Same as raid4 - - - Transitory layout - raid5_la RAID5 left asymmetric - - - rotating parity 0 with data continuation - raid5_ra RAID5 right asymmetric - - - rotating parity N with data continuation - raid5_ls RAID5 left symmetric - - - rotating parity 0 with data restart - raid5_rs RAID5 right symmetric - - - rotating parity N with data restart - raid6_zr RAID6 zero restart - - - rotating parity zero (left-to-right) with data restart - raid6_nr RAID6 N restart - - - rotating parity N (right-to-left) with data restart - raid6_nc RAID6 N continue - - - rotating parity N (right-to-left) with data continuation - raid6_n_6 RAID6 with dedicate parity disks - - - parity and Q-syndrome on the last 2 disks; - layout for takeover from/to raid4/raid5_n - raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk - - - layout for takeover from raid5_la from/to raid6 - raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk - - - layout for takeover from raid5_ra from/to raid6 - raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk - - - layout for takeover from raid5_ls from/to raid6 - raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk - - - layout for takeover from raid5_rs from/to raid6 - raid10 Various RAID10 inspired algorithms chosen by additional params - (see raid10_format and raid10_copies below) - - - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - - RAID1E: Integrated Adjacent Stripe Mirroring - - RAID1E: Integrated Offset Stripe Mirroring - - and other similar RAID10 variants - ============= =============================================================== - - Reference: Chapter 4 of - http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf - -<#raid_params>: The number of parameters that follow. - - consists of - - Mandatory parameters: - : - Chunk size in sectors. This parameter is often known as - "stripe size". It is the only mandatory parameter and - is placed first. - - followed by optional parameters (in any order): - [sync|nosync] - Force or prevent RAID initialization. - - [rebuild ] - Rebuild drive number 'idx' (first drive is 0). - - [daemon_sleep ] - Interval between runs of the bitmap daemon that - clear bits. A longer interval means less bitmap I/O but - resyncing after a failure is likely to take longer. - - [min_recovery_rate ] - Throttle RAID initialization - [max_recovery_rate ] - Throttle RAID initialization - [write_mostly ] - Mark drive index 'idx' write-mostly. - [max_write_behind ] - See '--write-behind=' (man mdadm) - [stripe_cache ] - Stripe cache size (RAID 4/5/6 only) - [region_size ] - The region_size multiplied by the number of regions is the - logical size of the array. The bitmap records the device - synchronisation state for each region. - - [raid10_copies <# copies>], [raid10_format ] - These two options are used to alter the default layout of - a RAID10 configuration. The number of copies is can be - specified, but the default is 2. There are also three - variations to how the copies are laid down - the default - is "near". Near copies are what most people think of with - respect to mirroring. If these options are left unspecified, - or 'raid10_copies 2' and/or 'raid10_format near' are given, - then the layouts for 2, 3 and 4 devices are: - - ======== ========== ============== - 2 drives 3 drives 4 drives - ======== ========== ============== - A1 A1 A1 A1 A2 A1 A1 A2 A2 - A2 A2 A2 A3 A3 A3 A3 A4 A4 - A3 A3 A4 A4 A5 A5 A5 A6 A6 - A4 A4 A5 A6 A6 A7 A7 A8 A8 - .. .. .. .. .. .. .. .. .. - ======== ========== ============== - - The 2-device layout is equivalent 2-way RAID1. The 4-device - layout is what a traditional RAID10 would look like. The - 3-device layout is what might be called a 'RAID1E - Integrated - Adjacent Stripe Mirroring'. - - If 'raid10_copies 2' and 'raid10_format far', then the layouts - for 2, 3 and 4 devices are: - - ======== ============ =================== - 2 drives 3 drives 4 drives - ======== ============ =================== - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - .. .. .. .. .. .. .. .. .. - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - ======== ============ =================== - - If 'raid10_copies 2' and 'raid10_format offset', then the - layouts for 2, 3 and 4 devices are: - - ======== ========== ================ - 2 drives 3 drives 4 drives - ======== ========== ================ - A1 A2 A1 A2 A3 A1 A2 A3 A4 - A2 A1 A3 A1 A2 A2 A1 A4 A3 - A3 A4 A4 A5 A6 A5 A6 A7 A8 - A4 A3 A6 A4 A5 A6 A5 A8 A7 - A5 A6 A7 A8 A9 A9 A10 A11 A12 - A6 A5 A9 A7 A8 A10 A9 A12 A11 - .. .. .. .. .. .. .. .. .. - ======== ========== ================ - - Here we see layouts closely akin to 'RAID1E - Integrated - Offset Stripe Mirroring'. - - [delta_disks ] - The delta_disks option value (-251 < N < +251) triggers - device removal (negative value) or device addition (positive - value) to any reshape supporting raid levels 4/5/6 and 10. - RAID levels 4/5/6 allow for addition of devices (metadata - and data device tuple), raid10_near and raid10_offset only - allow for device addition. raid10_far does not support any - reshaping at all. - A minimum of devices have to be kept to enforce resilience, - which is 3 devices for raid4/5 and 4 devices for raid6. - - [data_offset ] - This option value defines the offset into each data device - where the data starts. This is used to provide out-of-place - reshaping space to avoid writing over data while - changing the layout of stripes, hence an interruption/crash - may happen at any time without the risk of losing data. - E.g. when adding devices to an existing raid set during - forward reshaping, the out-of-place space will be allocated - at the beginning of each raid device. The kernel raid4/5/6/10 - MD personalities supporting such device addition will read the data from - the existing first stripes (those with smaller number of stripes) - starting at data_offset to fill up a new stripe with the larger - number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) - and write that new stripe to offset 0. Same will be applied to all - N-1 other new stripes. This out-of-place scheme is used to change - the RAID type (i.e. the allocation algorithm) as well, e.g. - changing from raid5_ls to raid5_n. - - [journal_dev ] - This option adds a journal device to raid4/5/6 raid sets and - uses it to close the 'write hole' caused by the non-atomic updates - to the component devices which can cause data loss during recovery. - The journal device is used as writethrough thus causing writes to - be throttled versus non-journaled raid4/5/6 sets. - Takeover/reshape is not possible with a raid4/5/6 journal device; - it has to be deconfigured before requesting these. - - [journal_mode ] - This option sets the caching mode on journaled raid4/5/6 raid sets - (see 'journal_dev ' above) to 'writethrough' or 'writeback'. - If 'writeback' is selected the journal device has to be resilient - and must not suffer from the 'write hole' problem itself (e.g. use - raid1 or raid10) to avoid a single point of failure. - -<#raid_devs>: The number of devices composing the array. - Each device consists of two entries. The first is the device - containing the metadata (if any); the second is the one containing the - data. A Maximum of 64 metadata/data device entries are supported - up to target version 1.8.0. - 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. - - If a drive has failed or is missing at creation time, a '-' can be - given for both the metadata and data drives for a given position. - - -Example Tables --------------- - -:: - - # RAID4 - 4 data drives, 1 parity (no metadata devices) - # No metadata devices specified to hold superblock/bitmap info - # Chunk size of 1MiB - # (Lines separated for easy reading) - - 0 1960893648 raid \ - raid4 1 2048 \ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 - - # RAID4 - 4 data drives, 1 parity (with metadata devices) - # Chunk size of 1MiB, force RAID initialization, - # min recovery rate at 20 kiB/sec/disk - - 0 1960893648 raid \ - raid4 4 2048 sync min_recovery_rate 20 \ - 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 - - -Status Output -------------- -'dmsetup table' displays the table used to construct the mapping. -The optional parameters are always printed in the order listed -above with "sync" or "nosync" always output ahead of the other -arguments, regardless of the order used when originally loading the table. -Arguments that can be repeated are ordered by value. - - -'dmsetup status' yields information on the state and health of the array. -The output is as follows (normally a single line, but expanded here for -clarity):: - - 1: raid \ - 2: <#devices> \ - 3: - -Line 1 is the standard output produced by device-mapper. - -Line 2 & 3 are produced by the raid target and are best explained by example:: - - 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 - -Here we can see the RAID type is raid4, there are 5 devices - all of -which are 'A'live, and the array is 2/490221568 complete with its initial -recovery. Here is a fuller description of the individual fields: - - =============== ========================================================= - Same as the used to create the array. - One char for each device, indicating: - - - 'A' = alive and in-sync - - 'a' = alive but not in-sync - - 'D' = dead/failed. - The ratio indicating how much of the array has undergone - the process described by 'sync_action'. If the - 'sync_action' is "check" or "repair", then the process - of "resync" or "recover" can be considered complete. - One of the following possible states: - - idle - - No synchronization action is being performed. - frozen - - The current action has been halted. - resync - - Array is undergoing its initial synchronization - or is resynchronizing after an unclean shutdown - (possibly aided by a bitmap). - recover - - A device in the array is being rebuilt or - replaced. - check - - A user-initiated full check of the array is - being performed. All blocks are read and - checked for consistency. The number of - discrepancies found are recorded in - . No changes are made to the - array by this action. - repair - - The same as "check", but discrepancies are - corrected. - reshape - - The array is undergoing a reshape. - The number of discrepancies found between mirror copies - in RAID1/10 or wrong parity values found in RAID4/5/6. - This value is valid only after a "check" of the array - is performed. A healthy array has a 'mismatch_cnt' of 0. - The current data offset to the start of the user data on - each component device of a raid set (see the respective - raid parameter to support out-of-place reshaping). - - 'A' - active write-through journal device. - - 'a' - active write-back journal device. - - 'D' - dead journal device. - - '-' - no journal device. - =============== ========================================================= - - -Message Interface ------------------ -The dm-raid target will accept certain actions through the 'message' interface. -('man dmsetup' for more information on the message interface.) These actions -include: - - ========= ================================================ - "idle" Halt the current sync action. - "frozen" Freeze the current sync action. - "resync" Initiate/continue a resync. - "recover" Initiate/continue a recover process. - "check" Initiate a check (i.e. a "scrub") of the array. - "repair" Initiate a repair of the array. - ========= ================================================ - - -Discard Support ---------------- -The implementation of discard support among hardware vendors varies. -When a block is discarded, some storage devices will return zeroes when -the block is read. These devices set the 'discard_zeroes_data' -attribute. Other devices will return random data. Confusingly, some -devices that advertise 'discard_zeroes_data' will not reliably return -zeroes when discarded blocks are read! Since RAID 4/5/6 uses blocks -from a number of devices to calculate parity blocks and (for performance -reasons) relies on 'discard_zeroes_data' being reliable, it is important -that the devices be consistent. Blocks may be discarded in the middle -of a RAID 4/5/6 stripe and if subsequent read results are not -consistent, the parity blocks may be calculated differently at any time; -making the parity blocks useless for redundancy. It is important to -understand how your hardware behaves with discards if you are going to -enable discards with RAID 4/5/6. - -Since the behavior of storage devices is unreliable in this respect, -even when reporting 'discard_zeroes_data', by default RAID 4/5/6 -discard support is disabled -- this ensures data integrity at the -expense of losing some performance. - -Storage devices that properly support 'discard_zeroes_data' are -increasingly whitelisted in the kernel and can thus be trusted. - -For trusted devices, the following dm-raid module parameter can be set -to safely enable discard support for RAID 4/5/6: - - 'devices_handle_discards_safely' - - -Version History ---------------- - -:: - - 1.0.0 Initial version. Support for RAID 4/5/6 - 1.1.0 Added support for RAID 1 - 1.2.0 Handle creation of arrays that contain failed devices. - 1.3.0 Added support for RAID 10 - 1.3.1 Allow device replacement/rebuild for RAID 10 - 1.3.2 Fix/improve redundancy checking for RAID10 - 1.4.0 Non-functional change. Removes arg from mapping function. - 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). - 1.4.2 Add RAID10 "far" and "offset" algorithm support. - 1.5.0 Add message interface to allow manipulation of the sync_action. - New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. - 1.5.1 Add ability to restore transiently failed devices on resume. - 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". - 1.6.0 Add discard support (and devices_handle_discard_safely module param). - 1.7.0 Add support for MD RAID0 mappings. - 1.8.0 Explicitly check for compatible flags in the superblock metadata - and reject to start the raid set if any are set by a newer - target version, thus avoiding data corruption on a raid set - with a reshape in progress. - 1.9.0 Add support for RAID level takeover/reshape/region size - and set size reduction. - 1.9.1 Fix activation of existing RAID 4/10 mapped devices - 1.9.2 Don't emit '- -' on the status table line in case the constructor - fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and - 'D' on the status line. If '- -' is passed into the constructor, emit - '- -' on the table line and '-' as the status line health character. - 1.10.0 Add support for raid4/5/6 journal device - 1.10.1 Fix data corruption on reshape request - 1.11.0 Fix table line argument order - (wrong raid10_copies/raid10_format sequence) - 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option - 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available - 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') - 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an - state races. - 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen - 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape - deadlock/potential data corruption. Update superblock when - specific devices are requested via rebuild. Fix RAID leg - rebuild errors. diff --git a/Documentation/device-mapper/dm-service-time.rst b/Documentation/device-mapper/dm-service-time.rst deleted file mode 100644 index facf277fc13c..000000000000 --- a/Documentation/device-mapper/dm-service-time.rst +++ /dev/null @@ -1,101 +0,0 @@ -=============== -dm-service-time -=============== - -dm-service-time is a path selector module for device-mapper targets, -which selects a path with the shortest estimated service time for -the incoming I/O. - -The service time for each path is estimated by dividing the total size -of in-flight I/Os on a path with the performance value of the path. -The performance value is a relative throughput value among all paths -in a path-group, and it can be specified as a table argument. - -The path selector name is 'service-time'. - -Table parameters for each path: - - [ []] - : - The number of I/Os to dispatch using the selected - path before switching to the next path. - If not given, internal default is used. To check - the default value, see the activated table. - : - The relative throughput value of the path - among all paths in the path-group. - The valid range is 0-100. - If not given, minimum value '1' is used. - If '0' is given, the path isn't selected while - other paths having a positive value are available. - -Status for each path: - - - : - 'A' if the path is active, 'F' if the path is failed. - : - The number of path failures. - : - The size of in-flight I/Os on the path. - : - The relative throughput value of the path - among all paths in the path-group. - - -Algorithm -========= - -dm-service-time adds the I/O size to 'in-flight-size' when the I/O is -dispatched and subtracts when completed. -Basically, dm-service-time selects a path having minimum service time -which is calculated by:: - - ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' - -However, some optimizations below are used to reduce the calculation -as much as possible. - - 1. If the paths have the same 'relative_throughput', skip - the division and just compare the 'in-flight-size'. - - 2. If the paths have the same 'in-flight-size', skip the division - and just compare the 'relative_throughput'. - - 3. If some paths have non-zero 'relative_throughput' and others - have zero 'relative_throughput', ignore those paths with zero - 'relative_throughput'. - -If such optimizations can't be applied, calculate service time, and -compare service time. -If calculated service time is equal, the path having maximum -'relative_throughput' may be better. So compare 'relative_throughput' -then. - - -Examples -======== -In case that 2 paths (sda and sdb) are used with repeat_count == 128 -and sda has an average throughput 1GB/s and sdb has 4GB/s, -'relative_throughput' value may be '1' for sda and '4' for sdb:: - - # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 - - -Or '2' for sda and '8' for sdb would be also true:: - - # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ - dmsetup create test - # - # dmsetup table - test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 - # - # dmsetup status - test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/device-mapper/dm-uevent.rst b/Documentation/device-mapper/dm-uevent.rst deleted file mode 100644 index 4a8ee8d069c9..000000000000 --- a/Documentation/device-mapper/dm-uevent.rst +++ /dev/null @@ -1,110 +0,0 @@ -==================== -device-mapper uevent -==================== - -The device-mapper uevent code adds the capability to device-mapper to create -and send kobject uevents (uevents). Previously device-mapper events were only -available through the ioctl interface. The advantage of the uevents interface -is the event contains environment attributes providing increased context for -the event avoiding the need to query the state of the device-mapper device after -the event is received. - -There are two functions currently for device-mapper events. The first function -listed creates the event and the second function sends the event(s):: - - void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, - const char *path, unsigned nr_valid_paths) - - void dm_send_uevents(struct list_head *events, struct kobject *kobj) - - -The variables added to the uevent environment are: - -Variable Name: DM_TARGET ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: -:Value: Name of device-mapper target that generated the event. - -Variable Name: DM_ACTION ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: -:Value: Device-mapper specific action that caused the uevent action. - PATH_FAILED - A path has failed; - PATH_REINSTATED - A path has been reinstated. - -Variable Name: DM_SEQNUM ------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: unsigned integer -:Description: A sequence number for this specific device-mapper device. -:Value: Valid unsigned integer range. - -Variable Name: DM_PATH ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: Major and minor number of the path device pertaining to this - event. -:Value: Path name in the form of "Major:Minor" - -Variable Name: DM_NR_VALID_PATHS --------------------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: unsigned integer -:Description: -:Value: Valid unsigned integer range. - -Variable Name: DM_NAME ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: Name of the device-mapper device. -:Value: Name - -Variable Name: DM_UUID ----------------------- -:Uevent Action(s): KOBJ_CHANGE -:Type: string -:Description: UUID of the device-mapper device. -:Value: UUID. (Empty string if there isn't one.) - -An example of the uevents generated as captured by udevmonitor is shown -below - -1.) Path failure:: - - UEVENT[1192521009.711215] change@/block/dm-3 - ACTION=change - DEVPATH=/block/dm-3 - SUBSYSTEM=block - DM_TARGET=multipath - DM_ACTION=PATH_FAILED - DM_SEQNUM=1 - DM_PATH=8:32 - DM_NR_VALID_PATHS=0 - DM_NAME=mpath2 - DM_UUID=mpath-35333333000002328 - MINOR=3 - MAJOR=253 - SEQNUM=1130 - -2.) Path reinstate:: - - UEVENT[1192521132.989927] change@/block/dm-3 - ACTION=change - DEVPATH=/block/dm-3 - SUBSYSTEM=block - DM_TARGET=multipath - DM_ACTION=PATH_REINSTATED - DM_SEQNUM=2 - DM_PATH=8:32 - DM_NR_VALID_PATHS=1 - DM_NAME=mpath2 - DM_UUID=mpath-35333333000002328 - MINOR=3 - MAJOR=253 - SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-zoned.rst b/Documentation/device-mapper/dm-zoned.rst deleted file mode 100644 index 07f56ebc1730..000000000000 --- a/Documentation/device-mapper/dm-zoned.rst +++ /dev/null @@ -1,146 +0,0 @@ -======== -dm-zoned -======== - -The dm-zoned device mapper target exposes a zoned block device (ZBC and -ZAC compliant devices) as a regular block device without any write -pattern constraints. In effect, it implements a drive-managed zoned -block device which hides from the user (a file system or an application -doing raw block device accesses) the sequential write constraints of -host-managed zoned block devices and can mitigate the potential -device-side performance degradation due to excessive random writes on -host-aware zoned block devices. - -For a more detailed description of the zoned block device models and -their constraints see (for SCSI devices): - -http://www.t10.org/drafts.htm#ZBC_Family - -and (for ATA devices): - -http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf - -The dm-zoned implementation is simple and minimizes system overhead (CPU -and memory usage as well as storage capacity loss). For a 10TB -host-managed disk with 256 MB zones, dm-zoned memory usage per disk -instance is at most 4.5 MB and as little as 5 zones will be used -internally for storing metadata and performaing reclaim operations. - -dm-zoned target devices are formatted and checked using the dmzadm -utility available at: - -https://github.com/hgst/dm-zoned-tools - -Algorithm -========= - -dm-zoned implements an on-disk buffering scheme to handle non-sequential -write accesses to the sequential zones of a zoned block device. -Conventional zones are used for caching as well as for storing internal -metadata. - -The zones of the device are separated into 2 types: - -1) Metadata zones: these are conventional zones used to store metadata. -Metadata zones are not reported as useable capacity to the user. - -2) Data zones: all remaining zones, the vast majority of which will be -sequential zones used exclusively to store user data. The conventional -zones of the device may be used also for buffering user random writes. -Data in these zones may be directly mapped to the conventional zone, but -later moved to a sequential zone so that the conventional zone can be -reused for buffering incoming random writes. - -dm-zoned exposes a logical device with a sector size of 4096 bytes, -irrespective of the physical sector size of the backend zoned block -device being used. This allows reducing the amount of metadata needed to -manage valid blocks (blocks written). - -The on-disk metadata format is as follows: - -1) The first block of the first conventional zone found contains the -super block which describes the on disk amount and position of metadata -blocks. - -2) Following the super block, a set of blocks is used to describe the -mapping of the logical device blocks. The mapping is done per chunk of -blocks, with the chunk size equal to the zoned block device size. The -mapping table is indexed by chunk number and each mapping entry -indicates the zone number of the device storing the chunk of data. Each -mapping entry may also indicate if the zone number of a conventional -zone used to buffer random modification to the data zone. - -3) A set of blocks used to store bitmaps indicating the validity of -blocks in the data zones follows the mapping table. A valid block is -defined as a block that was written and not discarded. For a buffered -data chunk, a block is always valid only in the data zone mapping the -chunk or in the buffer zone of the chunk. - -For a logical chunk mapped to a conventional zone, all write operations -are processed by directly writing to the zone. If the mapping zone is a -sequential zone, the write operation is processed directly only if the -write offset within the logical chunk is equal to the write pointer -offset within of the sequential data zone (i.e. the write operation is -aligned on the zone write pointer). Otherwise, write operations are -processed indirectly using a buffer zone. In that case, an unused -conventional zone is allocated and assigned to the chunk being -accessed. Writing a block to the buffer zone of a chunk will -automatically invalidate the same block in the sequential zone mapping -the chunk. If all blocks of the sequential zone become invalid, the zone -is freed and the chunk buffer zone becomes the primary zone mapping the -chunk, resulting in native random write performance similar to a regular -block device. - -Read operations are processed according to the block validity -information provided by the bitmaps. Valid blocks are read either from -the sequential zone mapping a chunk, or if the chunk is buffered, from -the buffer zone assigned. If the accessed chunk has no mapping, or the -accessed blocks are invalid, the read buffer is zeroed and the read -operation terminated. - -After some time, the limited number of convnetional zones available may -be exhausted (all used to map chunks or buffer sequential zones) and -unaligned writes to unbuffered chunks become impossible. To avoid this -situation, a reclaim process regularly scans used conventional zones and -tries to reclaim the least recently used zones by copying the valid -blocks of the buffer zone to a free sequential zone. Once the copy -completes, the chunk mapping is updated to point to the sequential zone -and the buffer zone freed for reuse. - -Metadata Protection -=================== - -To protect metadata against corruption in case of sudden power loss or -system crash, 2 sets of metadata zones are used. One set, the primary -set, is used as the main metadata region, while the secondary set is -used as a staging area. Modified metadata is first written to the -secondary set and validated by updating the super block in the secondary -set, a generation counter is used to indicate that this set contains the -newest metadata. Once this operation completes, in place of metadata -block updates can be done in the primary metadata set. This ensures that -one of the set is always consistent (all modifications committed or none -at all). Flush operations are used as a commit point. Upon reception of -a flush request, metadata modification activity is temporarily blocked -(for both incoming BIO processing and reclaim process) and all dirty -metadata blocks are staged and updated. Normal operation is then -resumed. Flushing metadata thus only temporarily delays write and -discard requests. Read requests can be processed concurrently while -metadata flush is being executed. - -Usage -===== - -A zoned block device must first be formatted using the dmzadm tool. This -will analyze the device zone configuration, determine where to place the -metadata sets on the device and initialize the metadata sets. - -Ex:: - - dmzadm --format /dev/sdxx - -For a formatted device, the target can be created normally with the -dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex:: - - echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ - dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/device-mapper/era.rst b/Documentation/device-mapper/era.rst deleted file mode 100644 index 90dd5c670b9f..000000000000 --- a/Documentation/device-mapper/era.rst +++ /dev/null @@ -1,116 +0,0 @@ -====== -dm-era -====== - -Introduction -============ - -dm-era is a target that behaves similar to the linear target. In -addition it keeps track of which blocks were written within a user -defined period of time called an 'era'. Each era target instance -maintains the current era as a monotonically increasing 32-bit -counter. - -Use cases include tracking changed blocks for backup software, and -partially invalidating the contents of a cache to restore cache -coherency after rolling back a vendor snapshot. - -Constructor -=========== - -era - - ================ ====================================================== - metadata dev fast device holding the persistent metadata - origin dev device holding data blocks that may change - block size block size of origin data device, granularity that is - tracked by the target - ================ ====================================================== - -Messages -======== - -None of the dm messages take any arguments. - -checkpoint ----------- - -Possibly move to a new era. You shouldn't assume the era has -incremented. After sending this message, you should check the -current era via the status line. - -take_metadata_snap ------------------- - -Create a clone of the metadata, to allow a userland process to read it. - -drop_metadata_snap ------------------- - -Drop the metadata snapshot. - -Status -====== - - <#used metadata blocks>/<#total metadata blocks> - - -========================= ============================================== -metadata block size Fixed block size for each metadata block in - sectors -#used metadata blocks Number of metadata blocks used -#total metadata blocks Total number of metadata blocks -current era The current era -held metadata root The location, in blocks, of the metadata root - that has been 'held' for userspace read - access. '-' indicates there is no held root -========================= ============================================== - -Detailed use case -================= - -The scenario of invalidating a cache when rolling back a vendor -snapshot was the primary use case when developing this target: - -Taking a vendor snapshot ------------------------- - -- Send a checkpoint message to the era target -- Make a note of the current era in its status line -- Take vendor snapshot (the era and snapshot should be forever - associated now). - -Rolling back to an vendor snapshot ----------------------------------- - -- Cache enters passthrough mode (see: dm-cache's docs in cache.txt) -- Rollback vendor storage -- Take metadata snapshot -- Ascertain which blocks have been written since the snapshot was taken - by checking each block's era -- Invalidate those blocks in the caching software -- Cache returns to writeback/writethrough mode - -Memory usage -============ - -The target uses a bitset to record writes in the current era. It also -has a spare bitset ready for switching over to a new era. Other than -that it uses a few 4k blocks for updating metadata:: - - (4 * nr_blocks) bytes + buffers - -Resilience -========== - -Metadata is updated on disk before a write to a previously unwritten -block is performed. As such dm-era should not be effected by a hard -crash such as power failure. - -Userland tools -============== - -Userland tools are found in the increasingly poorly named -thin-provisioning-tools project: - - https://github.com/jthornber/thin-provisioning-tools diff --git a/Documentation/device-mapper/index.rst b/Documentation/device-mapper/index.rst deleted file mode 100644 index 105e253bc231..000000000000 --- a/Documentation/device-mapper/index.rst +++ /dev/null @@ -1,44 +0,0 @@ -:orphan: - -============= -Device Mapper -============= - -.. toctree:: - :maxdepth: 1 - - cache-policies - cache - delay - dm-crypt - dm-flakey - dm-init - dm-integrity - dm-io - dm-log - dm-queue-length - dm-raid - dm-service-time - dm-uevent - dm-zoned - era - kcopyd - linear - log-writes - persistent-data - snapshot - statistics - striped - switch - thin-provisioning - unstriped - verity - writecache - zero - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/device-mapper/kcopyd.rst b/Documentation/device-mapper/kcopyd.rst deleted file mode 100644 index 7651d395127f..000000000000 --- a/Documentation/device-mapper/kcopyd.rst +++ /dev/null @@ -1,47 +0,0 @@ -====== -kcopyd -====== - -Kcopyd provides the ability to copy a range of sectors from one block-device -to one or more other block-devices, with an asynchronous completion -notification. It is used by dm-snapshot and dm-mirror. - -Users of kcopyd must first create a client and indicate how many memory pages -to set aside for their copy jobs. This is done with a call to -kcopyd_client_create():: - - int kcopyd_client_create(unsigned int num_pages, - struct kcopyd_client **result); - -To start a copy job, the user must set up io_region structures to describe -the source and destinations of the copy. Each io_region indicates a -block-device along with the starting sector and size of the region. The source -of the copy is given as one io_region structure, and the destinations of the -copy are given as an array of io_region structures:: - - struct io_region { - struct block_device *bdev; - sector_t sector; - sector_t count; - }; - -To start the copy, the user calls kcopyd_copy(), passing in the client -pointer, pointers to the source and destination io_regions, the name of a -completion callback routine, and a pointer to some context data for the copy:: - - int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, - unsigned int num_dests, struct io_region *dests, - unsigned int flags, kcopyd_notify_fn fn, void *context); - - typedef void (*kcopyd_notify_fn)(int read_err, unsigned int write_err, - void *context); - -When the copy completes, kcopyd will call the user's completion routine, -passing back the user's context pointer. It will also indicate if a read or -write error occurred during the copy. - -When a user is done with all their copy jobs, they should call -kcopyd_client_destroy() to delete the kcopyd client, which will release the -associated memory pages:: - - void kcopyd_client_destroy(struct kcopyd_client *kc); diff --git a/Documentation/device-mapper/linear.rst b/Documentation/device-mapper/linear.rst deleted file mode 100644 index 9d17fc6e64a9..000000000000 --- a/Documentation/device-mapper/linear.rst +++ /dev/null @@ -1,63 +0,0 @@ -========= -dm-linear -========= - -Device-Mapper's "linear" target maps a linear range of the Device-Mapper -device onto a linear range of another device. This is the basic building -block of logical volume managers. - -Parameters: - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - - -Example scripts -=============== - -:: - - #!/bin/sh - # Create an identity mapping for a device - echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity - -:: - - #!/bin/sh - # Join 2 devices together - size1=`blockdev --getsz $1` - size2=`blockdev --getsz $2` - echo "0 $size1 linear $1 0 - $size1 $size2 linear $2 0" | dmsetup create joined - -:: - - #!/usr/bin/perl -w - # Split a device into 4M chunks and then join them together in reverse order. - - my $name = "reverse"; - my $extent_size = 4 * 1024 * 2; - my $dev = $ARGV[0]; - my $table = ""; - my $count = 0; - - if (!defined($dev)) { - die("Please specify a device.\n"); - } - - my $dev_size = `blockdev --getsz $dev`; - my $extents = int($dev_size / $extent_size) - - (($dev_size % $extent_size) ? 1 : 0); - - while ($extents > 0) { - my $this_start = $count * $extent_size; - $extents--; - $count++; - my $this_offset = $extents * $extent_size; - - $table .= "$this_start $extent_size linear $dev $this_offset\n"; - } - - `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/device-mapper/log-writes.rst b/Documentation/device-mapper/log-writes.rst deleted file mode 100644 index 23141f2ffb7c..000000000000 --- a/Documentation/device-mapper/log-writes.rst +++ /dev/null @@ -1,145 +0,0 @@ -============= -dm-log-writes -============= - -This target takes 2 devices, one to pass all IO to normally, and one to log all -of the write operations to. This is intended for file system developers wishing -to verify the integrity of metadata or data as the file system is written to. -There is a log_write_entry written for every WRITE request and the target is -able to take arbitrary data from userspace to insert into the log. The data -that is in the WRITE requests is copied into the log to make the replay happen -exactly as it happened originally. - -Log Ordering -============ - -We log things in order of completion once we are sure the write is no longer in -cache. This means that normal WRITE requests are not actually logged until the -next REQ_PREFLUSH request. This is to make it easier for userspace to replay -the log in a way that correlates to what is on disk and not what is in cache, -to make it easier to detect improper waiting/flushing. - -This works by attaching all WRITE requests to a list once the write completes. -Once we see a REQ_PREFLUSH request we splice this list onto the request and once -the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only -completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to -simulate the worst case scenario with regard to power failures. Consider the -following example (W means write, C means complete): - - W1,W2,W3,C3,C2,Wflush,C1,Cflush - -The log would show the following: - - W3,W2,flush,W1.... - -Again this is to simulate what is actually on disk, this allows us to detect -cases where a power failure at a particular point in time would create an -inconsistent file system. - -Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as -they complete as those requests will obviously bypass the device cache. - -Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would -have all the DISCARD requests, and then the WRITE requests and then the FLUSH -request. Consider the following example: - - WRITE block 1, DISCARD block 1, FLUSH - -If we logged DISCARD when it completed, the replay would look like this: - - DISCARD 1, WRITE 1, FLUSH - -which isn't quite what happened and wouldn't be caught during the log replay. - -Target interface -================ - -i) Constructor - - log-writes - - ============= ============================================== - dev_path Device that all of the IO will go to normally. - log_dev_path Device where the log entries are written to. - ============= ============================================== - -ii) Status - - <#logged entries> - - =========================== ======================== - #logged entries Number of logged entries - highest allocated sector Highest allocated sector - =========================== ======================== - -iii) Messages - - mark - - You can use a dmsetup message to set an arbitrary mark in a log. - For example say you want to fsck a file system after every - write, but first you need to replay up to the mkfs to make sure - we're fsck'ing something reasonable, you would do something like - this:: - - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - - This would allow you to replay the log up to the mkfs mark and - then replay from that point on doing the fsck check in the - interval that you want. - - Every log has a mark at the end labeled "dm-log-writes-end". - -Userspace component -=================== - -There is a userspace tool that will replay the log for you in various ways. -It can be found here: https://github.com/josefbacik/log-writes - -Example usage -============= - -Say you want to test fsync on your file system. You would do something like -this:: - - TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" - dmsetup create log --table "$TABLE" - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - mount /dev/mapper/log /mnt/btrfs-test - - dmsetup message log 0 mark fsync - md5sum /mnt/btrfs-test/foo - umount /mnt/btrfs-test - - dmsetup remove log - replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync - mount /dev/sdb /mnt/btrfs-test - md5sum /mnt/btrfs-test/foo - - - Another option is to do a complicated file system operation and verify the file - system is consistent during the entire operation. You could do this with: - - TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" - dmsetup create log --table "$TABLE" - mkfs.btrfs -f /dev/mapper/log - dmsetup message log 0 mark mkfs - - mount /dev/mapper/log /mnt/btrfs-test - - btrfs filesystem balance /mnt/btrfs-test - umount /mnt/btrfs-test - dmsetup remove log - - replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs - btrfsck /dev/sdb - replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ - --fsck "btrfsck /dev/sdb" --check fua - -And that will replay the log until it sees a FUA request, run the fsck command -and if the fsck passes it will replay to the next FUA, until it is completed or -the fsck command exists abnormally. diff --git a/Documentation/device-mapper/persistent-data.rst b/Documentation/device-mapper/persistent-data.rst deleted file mode 100644 index 2065c3c5a091..000000000000 --- a/Documentation/device-mapper/persistent-data.rst +++ /dev/null @@ -1,88 +0,0 @@ -=============== -Persistent data -=============== - -Introduction -============ - -The more-sophisticated device-mapper targets require complex metadata -that is managed in kernel. In late 2010 we were seeing that various -different targets were rolling their own data structures, for example: - -- Mikulas Patocka's multisnap implementation -- Heinz Mauelshagen's thin provisioning target -- Another btree-based caching target posted to dm-devel -- Another multi-snapshot target based on a design of Daniel Phillips - -Maintaining these data structures takes a lot of work, so if possible -we'd like to reduce the number. - -The persistent-data library is an attempt to provide a re-usable -framework for people who want to store metadata in device-mapper -targets. It's currently used by the thin-provisioning target and an -upcoming hierarchical storage target. - -Overview -======== - -The main documentation is in the header files which can all be found -under drivers/md/persistent-data. - -The block manager ------------------ - -dm-block-manager.[hc] - -This provides access to the data on disk in fixed sized-blocks. There -is a read/write locking interface to prevent concurrent accesses, and -keep data that is being used in the cache. - -Clients of persistent-data are unlikely to use this directly. - -The transaction manager ------------------------ - -dm-transaction-manager.[hc] - -This restricts access to blocks and enforces copy-on-write semantics. -The only way you can get hold of a writable block through the -transaction manager is by shadowing an existing block (ie. doing -copy-on-write) or allocating a fresh one. Shadowing is elided within -the same transaction so performance is reasonable. The commit method -ensures that all data is flushed before it writes the superblock. -On power failure your metadata will be as it was when last committed. - -The Space Maps --------------- - -dm-space-map.h -dm-space-map-metadata.[hc] -dm-space-map-disk.[hc] - -On-disk data structures that keep track of reference counts of blocks. -Also acts as the allocator of new blocks. Currently two -implementations: a simpler one for managing blocks on a different -device (eg. thinly-provisioned data blocks); and one for managing -the metadata space. The latter is complicated by the need to store -its own data within the space it's managing. - -The data structures -------------------- - -dm-btree.[hc] -dm-btree-remove.c -dm-btree-spine.c -dm-btree-internal.h - -Currently there is only one data structure, a hierarchical btree. -There are plans to add more. For example, something with an -array-like interface would see a lot of use. - -The btree is 'hierarchical' in that you can define it to be composed -of nested btrees, and take multiple keys. For example, the -thin-provisioning target uses a btree with two levels of nesting. -The first maps a device id to a mapping tree, and that in turn maps a -virtual block to a physical block. - -Values stored in the btrees can have arbitrary size. Keys are always -64bits, although nesting allows you to use multiple keys. diff --git a/Documentation/device-mapper/snapshot.rst b/Documentation/device-mapper/snapshot.rst deleted file mode 100644 index ccdd8b587a74..000000000000 --- a/Documentation/device-mapper/snapshot.rst +++ /dev/null @@ -1,196 +0,0 @@ -============================== -Device-mapper snapshot support -============================== - -Device-mapper allows you, without massive data copying: - -- To create snapshots of any block device i.e. mountable, saved states of - the block device which are also writable without interfering with the - original content; -- To create device "forks", i.e. multiple different versions of the - same data stream. -- To merge a snapshot of a block device back into the snapshot's origin - device. - -In the first two cases, dm copies only the chunks of data that get -changed and uses a separate copy-on-write (COW) block device for -storage. - -For snapshot merge the contents of the COW storage are merged back into -the origin device. - - -There are three dm targets available: -snapshot, snapshot-origin, and snapshot-merge. - -- snapshot-origin - -which will normally have one or more snapshots based on it. -Reads will be mapped directly to the backing device. For each write, the -original data will be saved in the of each snapshot to keep -its visible content unchanged, at least until the fills up. - - -- snapshot - [<# feature args> []*] - -A snapshot of the block device is created. Changed chunks of - sectors will be stored on the . Writes will -only go to the . Reads will come from the or -from for unchanged data. will often be -smaller than the origin and if it fills up the snapshot will become -useless and be disabled, returning errors. So it is important to monitor -the amount of free space and expand the before it fills up. - - is P (Persistent) or N (Not persistent - will not survive -after reboot). O (Overflow) can be added as a persistent store option -to allow userspace to advertise its support for seeing "Overflow" in the -snapshot status. So supported store types are "P", "PO" and "N". - -The difference between persistent and transient is with transient -snapshots less metadata must be saved on disk - they can be kept in -memory by the kernel. - -When loading or unloading the snapshot target, the corresponding -snapshot-origin or snapshot-merge target must be suspended. A failure to -suspend the origin target could result in data corruption. - -Optional features: - - discard_zeroes_cow - a discard issued to the snapshot device that - maps to entire chunks to will zero the corresponding exception(s) in - the snapshot's exception store. - - discard_passdown_origin - a discard to the snapshot device is passed - down to the snapshot-origin's underlying device. This doesn't cause - copy-out to the snapshot exception store because the snapshot-origin - target is bypassed. - - The discard_passdown_origin feature depends on the discard_zeroes_cow - feature being enabled. - - -- snapshot-merge - [<# feature args> []*] - -takes the same table arguments as the snapshot target except it only -works with persistent snapshots. This target assumes the role of the -"snapshot-origin" target and must not be loaded if the "snapshot-origin" -is still present for . - -Creates a merging snapshot that takes control of the changed chunks -stored in the of an existing snapshot, through a handover -procedure, and merges these chunks back into the . Once merging -has started (in the background) the may be opened and the merge -will continue while I/O is flowing to it. Changes to the are -deferred until the merging snapshot's corresponding chunk(s) have been -merged. Once merging has started the snapshot device, associated with -the "snapshot" target, will return -EIO when accessed. - - -How snapshot is used by LVM2 -============================ -When you create the first LVM2 snapshot of a volume, four dm devices are used: - -1) a device containing the original mapping table of the source volume; -2) a device used as the ; -3) a "snapshot" device, combining #1 and #2, which is the visible snapshot - volume; -4) the "original" volume (which uses the device number used by the original - source volume), whose table is replaced by a "snapshot-origin" mapping - from device #1. - -A fixed naming scheme is used, so with the following commands:: - - lvcreate -L 1G -n base volumeGroup - lvcreate -L 100M --snapshot -n snap volumeGroup/base - -we'll have this situation (with volumes in above order):: - - # dmsetup table|grep volumeGroup - - volumeGroup-base-real: 0 2097152 linear 8:19 384 - volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 - volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 - volumeGroup-base: 0 2097152 snapshot-origin 254:11 - - # ls -lL /dev/mapper/volumeGroup-* - brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real - brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow - brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap - brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base - - -How snapshot-merge is used by LVM2 -================================== -A merging snapshot assumes the role of the "snapshot-origin" while -merging. As such the "snapshot-origin" is replaced with -"snapshot-merge". The "-real" device is not changed and the "-cow" -device is renamed to -cow to aid LVM2's cleanup of the -merging snapshot after it completes. The "snapshot" that hands over its -COW device to the "snapshot-merge" is deactivated (unless using lvchange ---refresh); but if it is left active it will simply return I/O errors. - -A snapshot will merge into its origin with the following command:: - - lvconvert --merge volumeGroup/snap - -we'll now have this situation:: - - # dmsetup table|grep volumeGroup - - volumeGroup-base-real: 0 2097152 linear 8:19 384 - volumeGroup-base-cow: 0 204800 linear 8:19 2097536 - volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 - - # ls -lL /dev/mapper/volumeGroup-* - brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real - brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow - brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base - - -How to determine when a merging is complete -=========================================== -The snapshot-merge and snapshot status lines end with: - - / - -Both and include both data and metadata. -During merging, the number of sectors allocated gets smaller and -smaller. Merging has finished when the number of sectors holding data -is zero, in other words == . - -Here is a practical example (using a hybrid of lvm and dmsetup commands):: - - # lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g - snap volumeGroup swi-a- 1.00g base 18.97 - - # dmsetup status volumeGroup-snap - 0 8388608 snapshot 397896/2097152 1560 - ^^^^ metadata sectors - - # lvconvert --merge -b volumeGroup/snap - Merging of volume snap started. - - # lvs volumeGroup/snap - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup Owi-a- 4.00g 17.23 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 281688/2097152 1104 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 180480/2097152 712 - - # dmsetup status volumeGroup-base - 0 8388608 snapshot-merge 16/2097152 16 - -Merging has finished. - -:: - - # lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g diff --git a/Documentation/device-mapper/statistics.rst b/Documentation/device-mapper/statistics.rst deleted file mode 100644 index 3d80a9f850cc..000000000000 --- a/Documentation/device-mapper/statistics.rst +++ /dev/null @@ -1,225 +0,0 @@ -============= -DM statistics -============= - -Device Mapper supports the collection of I/O statistics on user-defined -regions of a DM device. If no regions are defined no statistics are -collected so there isn't any performance impact. Only bio-based DM -devices are currently supported. - -Each user-defined region specifies a starting sector, length and step. -Individual statistics will be collected for each step-sized area within -the range specified. - -The I/O statistics counters for each step-sized area of a region are -in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are -provided: total time spent reading and writing. When the histogram -argument is used, the 14th parameter is reported that represents the -histogram of latencies. All these counters may be accessed by sending -the @stats_print message to the appropriate DM device via dmsetup. - -The reported times are in milliseconds and the granularity depends on -the kernel ticks. When the option precise_timestamps is used, the -reported times are in nanoseconds. - -Each region has a corresponding unique identifier, which we call a -region_id, that is assigned when the region is created. The region_id -must be supplied when querying statistics about the region, deleting the -region, etc. Unique region_ids enable multiple userspace programs to -request and process statistics for the same DM device without stepping -on each other's data. - -The creation of DM statistics will allocate memory via kmalloc or -fallback to using vmalloc space. At most, 1/4 of the overall system -memory may be allocated by DM statistics. The admin can see how much -memory is used by reading: - - /sys/module/dm_mod/parameters/stats_current_allocated_bytes - -Messages -======== - - @stats_create [ ...] [ []] - Create a new region and return the region_id. - - - "-" - whole device - "+" - a range of 512-byte sectors - starting with . - - - "" - the range is subdivided into areas each containing - sectors. - "/" - the range is subdivided into the specified - number of areas. - - - The number of optional arguments - - - The following optional arguments are supported: - - precise_timestamps - use precise timer with nanosecond resolution - instead of the "jiffies" variable. When this argument is - used, the resulting times are in nanoseconds instead of - milliseconds. Precise timestamps are a little bit slower - to obtain than jiffies-based timestamps. - histogram:n1,n2,n3,n4,... - collect histogram of latencies. The - numbers n1, n2, etc are times that represent the boundaries - of the histogram. If precise_timestamps is not used, the - times are in milliseconds, otherwise they are in - nanoseconds. For each range, the kernel will report the - number of requests that completed within this range. For - example, if we use "histogram:10,20,30", the kernel will - report four numbers a:b:c:d. a is the number of requests - that took 0-10 ms to complete, b is the number of requests - that took 10-20 ms to complete, c is the number of requests - that took 20-30 ms to complete and d is the number of - requests that took more than 30 ms to complete. - - - An optional parameter. A name that uniquely identifies - the userspace owner of the range. This groups ranges together - so that userspace programs can identify the ranges they - created and ignore those created by others. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use it for anything else. - If we omit the number of optional arguments, program id must not - be a number, otherwise it would be interpreted as the number of - optional arguments. - - - An optional parameter. A word that provides auxiliary data - that is useful to the client program that created the range. - The kernel returns this string back in the output of - @stats_list message, but it doesn't use this value for anything. - - @stats_delete - Delete the region with the specified id. - - - region_id returned from @stats_create - - @stats_clear - Clear all the counters except the in-flight i/o counters. - - - region_id returned from @stats_create - - @stats_list [] - List all regions registered with @stats_create. - - - An optional parameter. - If this parameter is specified, only matching regions - are returned. - If it is not specified, all regions are returned. - - Output format: - : + - precise_timestamps histogram:n1,n2,n3,... - - The strings "precise_timestamps" and "histogram" are printed only - if they were specified when creating the region. - - @stats_print [ ] - Print counters for each step-sized area of a region. - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are returned. - - - The number of lines to include in the output. - If omitted, all lines are returned. - - Output format for each step-sized area of a region: - - + - counters - - The first 11 counters have the same meaning as - `/sys/block/*/stat or /proc/diskstats`. - - Please refer to Documentation/iostats.txt for details. - - 1. the number of reads completed - 2. the number of reads merged - 3. the number of sectors read - 4. the number of milliseconds spent reading - 5. the number of writes completed - 6. the number of writes merged - 7. the number of sectors written - 8. the number of milliseconds spent writing - 9. the number of I/Os currently in progress - 10. the number of milliseconds spent doing I/Os - 11. the weighted number of milliseconds spent doing I/Os - - Additional counters: - - 12. the total time spent reading in milliseconds - 13. the total time spent writing in milliseconds - - @stats_print_clear [ ] - Atomically print and then clear all the counters except the - in-flight i/o counters. Useful when the client consuming the - statistics does not want to lose any statistics (those updated - between printing and clearing). - - - region_id returned from @stats_create - - - The index of the starting line in the output. - If omitted, all lines are printed and then cleared. - - - The number of lines to process. - If omitted, all lines are printed and then cleared. - - @stats_set_aux - Store auxiliary data aux_data for the specified region. - - - region_id returned from @stats_create - - - The string that identifies data which is useful to the client - program that created the range. The kernel returns this - string back in the output of @stats_list message, but it - doesn't use this value for anything. - -Examples -======== - -Subdivide the DM device 'vol' into 100 pieces and start collecting -statistics on them:: - - dmsetup message vol 0 @stats_create - /100 - -Set the auxiliary data string to "foo bar baz" (the escape for each -space must also be escaped, otherwise the shell will consume them):: - - dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz - -List the statistics:: - - dmsetup message vol 0 @stats_list - -Print the statistics:: - - dmsetup message vol 0 @stats_print 0 - -Delete the statistics:: - - dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/device-mapper/striped.rst b/Documentation/device-mapper/striped.rst deleted file mode 100644 index e9a8da192ae1..000000000000 --- a/Documentation/device-mapper/striped.rst +++ /dev/null @@ -1,61 +0,0 @@ -========= -dm-stripe -========= - -Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) -device across one or more underlying devices. Data is written in "chunks", -with consecutive chunks rotating among the underlying devices. This can -potentially provide improved I/O throughput by utilizing several physical -devices in parallel. - -Parameters: [ ]+ - : - Number of underlying devices. - : - Size of each chunk of data. Must be at least as - large as the system's PAGE_SIZE. - : - Full pathname to the underlying block-device, or a - "major:minor" device-number. - : - Starting sector within the device. - -One or more underlying devices can be specified. The striped device size must -be a multiple of the chunk size multiplied by the number of underlying devices. - - -Example scripts -=============== - -:: - - #!/usr/bin/perl -w - # Create a striped device across any number of underlying devices. The device - # will be called "stripe_dev" and have a chunk-size of 128k. - - my $chunk_size = 128 * 2; - my $dev_name = "stripe_dev"; - my $num_devs = @ARGV; - my @devs = @ARGV; - my ($min_dev_size, $stripe_dev_size, $i); - - if (!$num_devs) { - die("Specify at least one device\n"); - } - - $min_dev_size = `blockdev --getsz $devs[0]`; - for ($i = 1; $i < $num_devs; $i++) { - my $this_size = `blockdev --getsz $devs[$i]`; - $min_dev_size = ($min_dev_size < $this_size) ? - $min_dev_size : $this_size; - } - - $stripe_dev_size = $min_dev_size * $num_devs; - $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); - - $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; - for ($i = 0; $i < $num_devs; $i++) { - $table .= " $devs[$i] 0"; - } - - `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/device-mapper/switch.rst b/Documentation/device-mapper/switch.rst deleted file mode 100644 index 7dde06be1a4f..000000000000 --- a/Documentation/device-mapper/switch.rst +++ /dev/null @@ -1,141 +0,0 @@ -========= -dm-switch -========= - -The device-mapper switch target creates a device that supports an -arbitrary mapping of fixed-size regions of I/O across a fixed set of -paths. The path used for any specific region can be switched -dynamically by sending the target a message. - -It maps I/O to underlying block devices efficiently when there is a large -number of fixed-sized address regions but there is no simple pattern -that would allow for a compact representation of the mapping such as -dm-stripe. - -Background ----------- - -Dell EqualLogic and some other iSCSI storage arrays use a distributed -frameless architecture. In this architecture, the storage group -consists of a number of distinct storage arrays ("members") each having -independent controllers, disk storage and network adapters. When a LUN -is created it is spread across multiple members. The details of the -spreading are hidden from initiators connected to this storage system. -The storage group exposes a single target discovery portal, no matter -how many members are being used. When iSCSI sessions are created, each -session is connected to an eth port on a single member. Data to a LUN -can be sent on any iSCSI session, and if the blocks being accessed are -stored on another member the I/O will be forwarded as required. This -forwarding is invisible to the initiator. The storage layout is also -dynamic, and the blocks stored on disk may be moved from member to -member as needed to balance the load. - -This architecture simplifies the management and configuration of both -the storage group and initiators. In a multipathing configuration, it -is possible to set up multiple iSCSI sessions to use multiple network -interfaces on both the host and target to take advantage of the -increased network bandwidth. An initiator could use a simple round -robin algorithm to send I/O across all paths and let the storage array -members forward it as necessary, but there is a performance advantage to -sending data directly to the correct member. - -A device-mapper table already lets you map different regions of a -device onto different targets. However in this architecture the LUN is -spread with an address region size on the order of 10s of MBs, which -means the resulting table could have more than a million entries and -consume far too much memory. - -Using this device-mapper switch target we can now build a two-layer -device hierarchy: - - Upper Tier - Determine which array member the I/O should be sent to. - Lower Tier - Load balance amongst paths to a particular member. - -The lower tier consists of a single dm multipath device for each member. -Each of these multipath devices contains the set of paths directly to -the array member in one priority group, and leverages existing path -selectors to load balance amongst these paths. We also build a -non-preferred priority group containing paths to other array members for -failover reasons. - -The upper tier consists of a single dm-switch device. This device uses -a bitmap to look up the location of the I/O and choose the appropriate -lower tier device to route the I/O. By using a bitmap we are able to -use 4 bits for each address range in a 16 member group (which is very -large for us). This is a much denser representation than the dm table -b-tree can achieve. - -Construction Parameters -======================= - - [...] [ ]+ - - The number of paths across which to distribute the I/O. - - - The number of 512-byte sectors in a region. Each region can be redirected - to any of the available paths. - - - The number of optional arguments. Currently, no optional arguments - are supported and so this must be zero. - - - The block device that represents a specific path to the device. - - - The offset of the start of data on the specific (in units - of 512-byte sectors). This number is added to the sector number when - forwarding the request to the specific path. Typically it is zero. - -Messages -======== - -set_region_mappings : []: []:... - -Modify the region table by specifying which regions are redirected to -which paths. - - - The region number (region size was specified in constructor parameters). - If index is omitted, the next region (previous index + 1) is used. - Expressed in hexadecimal (WITHOUT any prefix like 0x). - - - The path number in the range 0 ... ( - 1). - Expressed in hexadecimal (WITHOUT any prefix like 0x). - -R, - This parameter allows repetitive patterns to be loaded quickly. and - are hexadecimal numbers. The last mappings are repeated in the next - slots. - -Status -====== - -No status line is reported. - -Example -======= - -Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with -the same size. - -Create a switch device with 64kB region size:: - - dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` - switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" - -Set mappings for the first 7 entries to point to devices switch0, switch1, -switch2, switch0, switch1, switch2, switch1:: - - dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 - -Set repetitive mapping. This command:: - - dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 - -is equivalent to:: - - dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ - :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 diff --git a/Documentation/device-mapper/thin-provisioning.rst b/Documentation/device-mapper/thin-provisioning.rst deleted file mode 100644 index bafebf79da4b..000000000000 --- a/Documentation/device-mapper/thin-provisioning.rst +++ /dev/null @@ -1,427 +0,0 @@ -================= -Thin provisioning -================= - -Introduction -============ - -This document describes a collection of device-mapper targets that -between them implement thin-provisioning and snapshots. - -The main highlight of this implementation, compared to the previous -implementation of snapshots, is that it allows many virtual devices to -be stored on the same data volume. This simplifies administration and -allows the sharing of data between volumes, thus reducing disk usage. - -Another significant feature is support for an arbitrary depth of -recursive snapshots (snapshots of snapshots of snapshots ...). The -previous implementation of snapshots did this by chaining together -lookup tables, and so performance was O(depth). This new -implementation uses a single data structure to avoid this degradation -with depth. Fragmentation may still be an issue, however, in some -scenarios. - -Metadata is stored on a separate device from data, giving the -administrator some freedom, for example to: - -- Improve metadata resilience by storing metadata on a mirrored volume - but data on a non-mirrored one. - -- Improve performance by storing the metadata on SSD. - -Status -====== - -These targets are considered safe for production use. But different use -cases will have different performance characteristics, for example due -to fragmentation of the data volume. - -If you find this software is not performing as expected please mail -dm-devel@redhat.com with details and we'll try our best to improve -things for you. - -Userspace tools for checking and repairing the metadata have been fully -developed and are available as 'thin_check' and 'thin_repair'. The name -of the package that provides these utilities varies by distribution (on -a Red Hat distribution it is named 'device-mapper-persistent-data'). - -Cookbook -======== - -This section describes some quick recipes for using thin provisioning. -They use the dmsetup program to control the device-mapper driver -directly. End users will be advised to use a higher-level volume -manager such as LVM2 once support has been added. - -Pool device ------------ - -The pool device ties together the metadata volume and the data volume. -It maps I/O linearly to the data volume and updates the metadata via -two mechanisms: - -- Function calls from the thin targets - -- Device-mapper 'messages' from userspace which control the creation of new - virtual devices amongst other things. - -Setting up a fresh pool device ------------------------------- - -Setting up a pool device requires a valid metadata device, and a -data device. If you do not have an existing metadata device you can -make one by zeroing the first 4k to indicate empty metadata. - - dd if=/dev/zero of=$metadata_dev bs=4096 count=1 - -The amount of metadata you need will vary according to how many blocks -are shared between thin devices (i.e. through snapshots). If you have -less sharing than average you'll need a larger-than-average metadata device. - -As a guide, we suggest you calculate the number of bytes to use in the -metadata device as 48 * $data_dev_size / $data_block_size but round it up -to 2MB if the answer is smaller. If you're creating large numbers of -snapshots which are recording large amounts of change, you may find you -need to increase this. - -The largest size supported is 16GB: If the device is larger, -a warning will be issued and the excess space will not be used. - -Reloading a pool table ----------------------- - -You may reload a pool's table, indeed this is how the pool is resized -if it runs out of space. (N.B. While specifying a different metadata -device when reloading is not forbidden at the moment, things will go -wrong if it does not route I/O to exactly the same on-disk location as -previously.) - -Using an existing pool device ------------------------------ - -:: - - dmsetup create pool \ - --table "0 20971520 thin-pool $metadata_dev $data_dev \ - $data_block_size $low_water_mark" - -$data_block_size gives the smallest unit of disk space that can be -allocated at a time expressed in units of 512-byte sectors. -$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a -multiple of 128 (64KB). $data_block_size cannot be changed after the -thin-pool is created. People primarily interested in thin provisioning -may want to use a value such as 1024 (512KB). People doing lots of -snapshotting may want a smaller value such as 128 (64KB). If you are -not zeroing newly-allocated data, a larger $data_block_size in the -region of 256000 (128MB) is suggested. - -$low_water_mark is expressed in blocks of size $data_block_size. If -free space on the data device drops below this level then a dm event -will be triggered which a userspace daemon should catch allowing it to -extend the pool device. Only one such event will be sent. - -No special event is triggered if a just resumed device's free space is below -the low water mark. However, resuming a device always triggers an -event; a userspace daemon should verify that free space exceeds the low -water mark when handling this event. - -A low water mark for the metadata device is maintained in the kernel and -will trigger a dm event if free space on the metadata device drops below -it. - -Updating on-disk metadata -------------------------- - -On-disk metadata is committed every time a FLUSH or FUA bio is written. -If no such requests are made then commits will occur every second. This -means the thin-provisioning target behaves like a physical disk that has -a volatile write cache. If power is lost you may lose some recent -writes. The metadata should always be consistent in spite of any crash. - -If data space is exhausted the pool will either error or queue IO -according to the configuration (see: error_if_no_space). If metadata -space is exhausted or a metadata operation fails: the pool will error IO -until the pool is taken offline and repair is performed to 1) fix any -potential inconsistencies and 2) clear the flag that imposes repair. -Once the pool's metadata device is repaired it may be resized, which -will allow the pool to return to normal operation. Note that if a pool -is flagged as needing repair, the pool's data and metadata devices -cannot be resized until repair is performed. It should also be noted -that when the pool's metadata space is exhausted the current metadata -transaction is aborted. Given that the pool will cache IO whose -completion may have already been acknowledged to upper IO layers -(e.g. filesystem) it is strongly suggested that consistency checks -(e.g. fsck) be performed on those layers when repair of the pool is -required. - -Thin provisioning ------------------ - -i) Creating a new thinly-provisioned volume. - - To create a new thinly- provisioned volume you must send a message to an - active pool device, /dev/mapper/pool in this example:: - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - - Here '0' is an identifier for the volume, a 24-bit number. It's up - to the caller to allocate and manage these identifiers. If the - identifier is already in use, the message will fail with -EEXIST. - -ii) Using a thinly-provisioned volume. - - Thinly-provisioned volumes are activated using the 'thin' target:: - - dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" - - The last parameter is the identifier for the thinp device. - -Internal snapshots ------------------- - -i) Creating an internal snapshot. - - Snapshots are created with another message to the pool. - - N.B. If the origin device that you wish to snapshot is active, you - must suspend it before creating the snapshot to avoid corruption. - This is NOT enforced at the moment, so please be careful! - - :: - - dmsetup suspend /dev/mapper/thin - dmsetup message /dev/mapper/pool 0 "create_snap 1 0" - dmsetup resume /dev/mapper/thin - - Here '1' is the identifier for the volume, a 24-bit number. '0' is the - identifier for the origin device. - -ii) Using an internal snapshot. - - Once created, the user doesn't have to worry about any connection - between the origin and the snapshot. Indeed the snapshot is no - different from any other thinly-provisioned device and can be - snapshotted itself via the same method. It's perfectly legal to - have only one of them active, and there's no ordering requirement on - activating or removing them both. (This differs from conventional - device-mapper snapshots.) - - Activate it exactly the same way as any other thinly-provisioned volume:: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" - -External snapshots ------------------- - -You can use an external **read only** device as an origin for a -thinly-provisioned volume. Any read to an unprovisioned area of the -thin device will be passed through to the origin. Writes trigger -the allocation of new blocks as usual. - -One use case for this is VM hosts that want to run guests on -thinly-provisioned volumes but have the base image on another device -(possibly shared between many VMs). - -You must not write to the origin device if you use this technique! -Of course, you may write to the thin device and take internal snapshots -of the thin volume. - -i) Creating a snapshot of an external device - - This is the same as creating a thin device. - You don't mention the origin at this stage. - - :: - - dmsetup message /dev/mapper/pool 0 "create_thin 0" - -ii) Using a snapshot of an external device. - - Append an extra parameter to the thin target specifying the origin:: - - dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" - - N.B. All descendants (internal snapshots) of this snapshot require the - same extra origin parameter. - -Deactivation ------------- - -All devices using a pool must be deactivated before the pool itself -can be. - -:: - - dmsetup remove thin - dmsetup remove snap - dmsetup remove pool - -Reference -========= - -'thin-pool' target ------------------- - -i) Constructor - - :: - - thin-pool \ - [ []*] - - Optional feature arguments: - - skip_block_zeroing: - Skip the zeroing of newly-provisioned blocks. - - ignore_discard: - Disable discard support. - - no_discard_passdown: - Don't pass discards down to the underlying - data device, but just remove the mapping. - - read_only: - Don't allow any changes to be made to the pool - metadata. This mode is only available after the - thin-pool has been created and first used in full - read/write mode. It cannot be specified on initial - thin-pool creation. - - error_if_no_space: - Error IOs, instead of queueing, if no space. - - Data block size must be between 64KB (128 sectors) and 1GB - (2097152 sectors) inclusive. - - -ii) Status - - :: - - / - / - ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space - needs_check|- metadata_low_watermark - - transaction id: - A 64-bit number used by userspace to help synchronise with metadata - from volume managers. - - used data blocks / total data blocks - If the number of free blocks drops below the pool's low water mark a - dm event will be sent to userspace. This event is edge-triggered and - it will occur only once after each resume so volume manager writers - should register for the event and then check the target's status. - - held metadata root: - The location, in blocks, of the metadata root that has been - 'held' for userspace read access. '-' indicates there is no - held root. - - discard_passdown|no_discard_passdown - Whether or not discards are actually being passed down to the - underlying device. When this is enabled when loading the table, - it can get disabled if the underlying device doesn't support it. - - ro|rw|out_of_data_space - If the pool encounters certain types of device failures it will - drop into a read-only metadata mode in which no changes to - the pool metadata (like allocating new blocks) are permitted. - - In serious cases where even a read-only mode is deemed unsafe - no further I/O will be permitted and the status will just - contain the string 'Fail'. The userspace recovery tools - should then be used. - - error_if_no_space|queue_if_no_space - If the pool runs out of data or metadata space, the pool will - either queue or error the IO destined to the data device. The - default is to queue the IO until more space is added or the - 'no_space_timeout' expires. The 'no_space_timeout' dm-thin-pool - module parameter can be used to change this timeout -- it - defaults to 60 seconds but may be disabled using a value of 0. - - needs_check - A metadata operation has failed, resulting in the needs_check - flag being set in the metadata's superblock. The metadata - device must be deactivated and checked/repaired before the - thin-pool can be made fully operational again. '-' indicates - needs_check is not set. - - metadata_low_watermark: - Value of metadata low watermark in blocks. The kernel sets this - value internally but userspace needs to know this value to - determine if an event was caused by crossing this threshold. - -iii) Messages - - create_thin - Create a new thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - - create_snap - Create a new snapshot of another thinly-provisioned device. - is an arbitrary unique 24-bit identifier chosen by - the caller. - is the identifier of the thinly-provisioned device - of which the new device will be a snapshot. - - delete - Deletes a thin device. Irreversible. - - set_transaction_id - Userland volume managers, such as LVM, need a way to - synchronise their external metadata with the internal metadata of the - pool target. The thin-pool target offers to store an - arbitrary 64-bit transaction id and return it on the target's - status line. To avoid races you must provide what you think - the current transaction id is when you change it with this - compare-and-swap message. - - reserve_metadata_snap - Reserve a copy of the data mapping btree for use by userland. - This allows userland to inspect the mappings as they were when - this message was executed. Use the pool's status command to - get the root block associated with the metadata snapshot. - - release_metadata_snap - Release a previously reserved copy of the data mapping btree. - -'thin' target -------------- - -i) Constructor - - :: - - thin [] - - pool dev: - the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 - - dev id: - the internal device identifier of the device to be - activated. - - external origin dev: - an optional block device outside the pool to be treated as a - read-only snapshot origin: reads to unprovisioned areas of the - thin target will be mapped to this device. - -The pool doesn't store any size against the thin devices. If you -load a thin target that is smaller than you've been using previously, -then you'll have no access to blocks mapped beyond the end. If you -load a target that is bigger than before, then extra blocks will be -provisioned as and when needed. - -ii) Status - - - If the pool has encountered device errors and failed, the status - will just contain the string 'Fail'. The userspace recovery - tools should then be used. - - In the case where is 0, there is no highest - mapped sector and the value of is unspecified. diff --git a/Documentation/device-mapper/unstriped.rst b/Documentation/device-mapper/unstriped.rst deleted file mode 100644 index 0a8d3eb3f072..000000000000 --- a/Documentation/device-mapper/unstriped.rst +++ /dev/null @@ -1,135 +0,0 @@ -================================ -Device-mapper "unstriped" target -================================ - -Introduction -============ - -The device-mapper "unstriped" target provides a transparent mechanism to -unstripe a device-mapper "striped" target to access the underlying disks -without having to touch the true backing block-device. It can also be -used to unstripe a hardware RAID-0 to access backing disks. - -Parameters: - - - - The number of stripes in the RAID 0. - - - The amount of 512B sectors in the chunk striping. - - - The block device you wish to unstripe. - - - The stripe number within the device that corresponds to physical - drive you wish to unstripe. This must be 0 indexed. - - -Why use this module? -==================== - -An example of undoing an existing dm-stripe -------------------------------------------- - -This small bash script will setup 4 loop devices and use the existing -striped target to combine the 4 devices into one. It then will use -the unstriped target ontop of the striped device to access the -individual backing loop devices. We write data to the newly exposed -unstriped devices and verify the data written matches the correct -underlying device on the striped array:: - - #!/bin/bash - - MEMBER_SIZE=$((128 * 1024 * 1024)) - NUM=4 - SEQ_END=$((${NUM}-1)) - CHUNK=256 - BS=4096 - - RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) - DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" - COUNT=$((${MEMBER_SIZE} / ${BS})) - - for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct - losetup /dev/loop${i} member-${i} - DM_PARMS+=" /dev/loop${i} 0" - done - - echo $DM_PARMS | dmsetup create raid0 - for i in $(seq 0 ${SEQ_END}); do - echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} - done; - - for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct - diff /dev/mapper/set-${i} member-${i} - done; - - for i in $(seq 0 ${SEQ_END}); do - dmsetup remove set-${i} - done - - dmsetup remove raid0 - - for i in $(seq 0 ${SEQ_END}); do - losetup -d /dev/loop${i} - rm -f member-${i} - done - -Another example ---------------- - -Intel NVMe drives contain two cores on the physical device. -Each core of the drive has segregated access to its LBA range. -The current LBA model has a RAID 0 128k chunk on each core, resulting -in a 256k stripe across the two cores:: - - Core 0: Core 1: - __________ __________ - | LBA 512| | LBA 768| - | LBA 0 | | LBA 256| - ---------- ---------- - -The purpose of this unstriping is to provide better QoS in noisy -neighbor environments. When two partitions are created on the -aggregate drive without this unstriping, reads on one partition -can affect writes on another partition. This is because the partitions -are striped across the two cores. When we unstripe this hardware RAID 0 -and make partitions on each new exposed device the two partitions are now -physically separated. - -With the dm-unstriped target we're able to segregate an fio script that -has read and write jobs that are independent of each other. Compared to -when we run the test on a combined drive with partitions, we were able -to get a 92% reduction in read latency using this device mapper target. - - -Example dmsetup usage -===================== - -unstriped ontop of Intel NVMe device that has 2 cores ------------------------------------------------------ - -:: - - dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' - dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' - -There will now be two devices that expose Intel NVMe core 0 and 1 -respectively:: - - /dev/mapper/nvmset0 - /dev/mapper/nvmset1 - -unstriped ontop of striped with 4 drives using 128K chunk size --------------------------------------------------------------- - -:: - - dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' - dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' - dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' - dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/device-mapper/verity.rst b/Documentation/device-mapper/verity.rst deleted file mode 100644 index a4d1c1476d72..000000000000 --- a/Documentation/device-mapper/verity.rst +++ /dev/null @@ -1,229 +0,0 @@ -========= -dm-verity -========= - -Device-Mapper's "verity" target provides transparent integrity checking of -block devices using a cryptographic digest provided by the kernel crypto API. -This target is read-only. - -Construction Parameters -======================= - -:: - - - - - - [<#opt_params> ] - - - This is the type of the on-disk hash format. - - 0 is the original format used in the Chromium OS. - The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeroes. - - 1 is the current format that should be used for new devices. - The salt is prepended when hashing and each digest is - padded with zeroes to the power of two. - - - This is the device containing data, the integrity of which needs to be - checked. It may be specified as a path, like /dev/sdaX, or a device number, - :. - - - This is the device that supplies the hash tree data. It may be - specified similarly to the device path and may be the same device. If the - same device is used, the hash_start should be outside the configured - dm-verity device. - - - The block size on a data device in bytes. - Each block corresponds to one digest on the hash device. - - - The size of a hash block in bytes. - - - The number of data blocks on the data device. Additional blocks are - inaccessible. You can place hashes to the same partition as data, in this - case hashes are placed after . - - - This is the offset, in -blocks, from the start of hash_dev - to the root block of the hash tree. - - - The cryptographic hash algorithm used for this device. This should - be the name of the algorithm, like "sha1". - - - The hexadecimal encoding of the cryptographic hash of the root hash block - and the salt. This hash should be trusted as there is no other authenticity - beyond this point. - - - The hexadecimal encoding of the salt value. - -<#opt_params> - Number of optional parameters. If there are no optional parameters, - the optional paramaters section can be skipped or #opt_params can be zero. - Otherwise #opt_params is the number of following arguments. - - Example of optional parameters section: - 1 ignore_corruption - -ignore_corruption - Log corrupted blocks, but allow read operations to proceed normally. - -restart_on_corruption - Restart the system when a corrupted block is discovered. This option is - not compatible with ignore_corruption and requires user space support to - avoid restart loops. - -ignore_zero_blocks - Do not verify blocks that are expected to contain zeroes and always return - zeroes instead. This may be useful if the partition contains unused blocks - that are not guaranteed to contain zeroes. - -use_fec_from_device - Use forward error correction (FEC) to recover from corruption if hash - verification fails. Use encoding data from the specified device. This - may be the same device where data and hash blocks reside, in which case - fec_start must be outside data and hash areas. - - If the encoding data covers additional metadata, it must be accessible - on the hash device after the hash blocks. - - Note: block sizes for data and hash devices must match. Also, if the - verity is encrypted the should be too. - -fec_roots - Number of generator roots. This equals to the number of parity bytes in - the encoding data. For example, in RS(M, N) encoding, the number of roots - is M-N. - -fec_blocks - The number of encoding data blocks on the FEC device. The block size for - the FEC device is . - -fec_start - This is the offset, in blocks, from the start of the - FEC device to the beginning of the encoding data. - -check_at_most_once - Verify data blocks only the first time they are read from the data device, - rather than every time. This reduces the overhead of dm-verity so that it - can be used on systems that are memory and/or CPU constrained. However, it - provides a reduced level of security because only offline tampering of the - data device's content will be detected, not online tampering. - - Hash blocks are still verified each time they are read from the hash device, - since verification of hash blocks is less performance critical than data - blocks, and a hash block will not be verified any more after all the data - blocks it covers have been verified anyway. - -Theory of operation -=================== - -dm-verity is meant to be set up as part of a verified boot path. This -may be anything ranging from a boot using tboot or trustedgrub to just -booting from a known-good device (like a USB drive or CD). - -When a dm-verity device is configured, it is expected that the caller -has been authenticated in some way (cryptographic signatures, etc). -After instantiation, all hashes will be verified on-demand during -disk access. If they cannot be verified up to the root node of the -tree, the root hash, then the I/O will fail. This should detect -tampering with any data on the device and the hash data. - -Cryptographic hashes are used to assert the integrity of the device on a -per-block basis. This allows for a lightweight hash computation on first read -into the page cache. Block hashes are stored linearly, aligned to the nearest -block size. - -If forward error correction (FEC) support is enabled any recovery of -corrupted data will be verified using the cryptographic hash of the -corresponding data. This is why combining error correction with -integrity checking is essential. - -Hash Tree ---------- - -Each node in the tree is a cryptographic hash. If it is a leaf node, the hash -of some data block on disk is calculated. If it is an intermediary node, -the hash of a number of child nodes is calculated. - -Each entry in the tree is a collection of neighboring nodes that fit in one -block. The number is determined based on block_size and the size of the -selected cryptographic digest algorithm. The hashes are linearly-ordered in -this entry and any unaligned trailing space is ignored but included when -calculating the parent node. - -The tree looks something like: - - alg = sha256, num_blocks = 32768, block_size = 4096 - -:: - - [ root ] - / . . . \ - [entry_0] [entry_1] - / . . . \ . . . \ - [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] - / ... \ / . . . \ / \ - blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 - - -On-disk format -============== - -The verity kernel code does not read the verity metadata on-disk header. -It only reads the hash blocks which directly follow the header. -It is expected that a user-space tool will verify the integrity of the -verity header. - -Alternatively, the header can be omitted and the dmsetup parameters can -be passed via the kernel command-line in a rooted chain of trust where -the command-line is verified. - -Directly following the header (and with sector number padded to the next hash -block boundary) are the hash blocks which are stored a depth at a time -(starting from the root), sorted in order of increasing index. - -The full specification of kernel parameters and on-disk metadata format -is available at the cryptsetup project's wiki page - - https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity - -Status -====== -V (for Valid) is returned if every check performed so far was valid. -If any check failed, C (for Corruption) is returned. - -Example -======= -Set up a device:: - - # dmsetup create vroot --readonly --table \ - "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ - "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ - "1234000000000000000000000000000000000000000000000000000000000000" - -A command line tool veritysetup is available to compute or verify -the hash tree or activate the kernel device. This is available from -the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ -(as a libcryptsetup extension). - -Create hash on the device:: - - # veritysetup format /dev/sda1 /dev/sda2 - ... - Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 - -Activate the device:: - - # veritysetup create vroot /dev/sda1 /dev/sda2 \ - 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/device-mapper/writecache.rst b/Documentation/device-mapper/writecache.rst deleted file mode 100644 index d3d7690f5e8d..000000000000 --- a/Documentation/device-mapper/writecache.rst +++ /dev/null @@ -1,79 +0,0 @@ -================= -Writecache target -================= - -The writecache target caches writes on persistent memory or on SSD. It -doesn't cache reads because reads are supposed to be cached in page cache -in normal RAM. - -When the device is constructed, the first sector should be zeroed or the -first sector should contain valid superblock from previous invocation. - -Constructor parameters: - -1. type of the cache device - "p" or "s" - - - p - persistent memory - - s - SSD -2. the underlying device that will be cached -3. the cache device -4. block size (4096 is recommended; the maximum block size is the page - size) -5. the number of optional parameters (the parameters with an argument - count as two) - - start_sector n (default: 0) - offset from the start of cache device in 512-byte sectors - high_watermark n (default: 50) - start writeback when the number of used blocks reach this - watermark - low_watermark x (default: 45) - stop writeback when the number of used blocks drops below - this watermark - writeback_jobs n (default: unlimited) - limit the number of blocks that are in flight during - writeback. Setting this value reduces writeback - throughput, but it may improve latency of read requests - autocommit_blocks n (default: 64 for pmem, 65536 for ssd) - when the application writes this amount of blocks without - issuing the FLUSH request, the blocks are automatically - commited - autocommit_time ms (default: 1000) - autocommit time in milliseconds. The data is automatically - commited if this time passes and no FLUSH request is - received - fua (by default on) - applicable only to persistent memory - use the FUA flag - when writing data from persistent memory back to the - underlying device - nofua - applicable only to persistent memory - don't use the FUA - flag when writing back data and send the FLUSH request - afterwards - - - some underlying devices perform better with fua, some - with nofua. The user should test it - -Status: -1. error indicator - 0 if there was no error, otherwise error number -2. the number of blocks -3. the number of free blocks -4. the number of blocks under writeback - -Messages: - flush - flush the cache device. The message returns successfully - if the cache device was flushed without an error - flush_on_suspend - flush the cache device on next suspend. Use this message - when you are going to remove the cache device. The proper - sequence for removing the cache device is: - - 1. send the "flush_on_suspend" message - 2. load an inactive table with a linear target that maps - to the underlying device - 3. suspend the device - 4. ask for status and verify that there are no errors - 5. resume the device, so that it will use the linear - target - 6. the cache device is now inactive and it can be deleted diff --git a/Documentation/device-mapper/zero.rst b/Documentation/device-mapper/zero.rst deleted file mode 100644 index 11fb5cf4597c..000000000000 --- a/Documentation/device-mapper/zero.rst +++ /dev/null @@ -1,37 +0,0 @@ -======= -dm-zero -======= - -Device-Mapper's "zero" target provides a block-device that always returns -zero'd data on reads and silently drops writes. This is similar behavior to -/dev/zero, but as a block-device instead of a character-device. - -Dm-zero has no target-specific parameters. - -One very interesting use of dm-zero is for creating "sparse" devices in -conjunction with dm-snapshot. A sparse device reports a device-size larger -than the amount of actual storage space available for that device. A user can -write data anywhere within the sparse device and read it back like a normal -device. Reads to previously unwritten areas will return a zero'd buffer. When -enough data has been written to fill up the actual storage space, the sparse -device is deactivated. This can be very useful for testing device and -filesystem limitations. - -To create a sparse device, start by creating a dm-zero device that's the -desired size of the sparse device. For this example, we'll assume a 10TB -sparse device:: - - TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors - echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 - -Then create a snapshot of the zero device, using any available block-device as -the COW device. The size of the COW device will determine the amount of real -space available to the sparse device. For this example, we'll assume /dev/sdb1 -is an available 10GB partition:: - - echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ - dmsetup create sparse1 - -This will create a 10TB sparse device called /dev/mapper/sparse1 that has -10GB of actual storage space available. If more than 10GB of data is written -to this device, it will start returning I/O errors. -- cgit From ec4b78a0e7dd4751423089b7cfd32168f9052377 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 15:00:25 -0300 Subject: docs: early-userspace: move to driver-api guide Those documents describe a kAPI. So, add to the driver-api book. Signed-off-by: Mauro Carvalho Chehab --- .../driver-api/early-userspace/buffer-format.rst | 119 ++++++++++++++++ .../early-userspace/early_userspace_support.rst | 154 +++++++++++++++++++++ Documentation/driver-api/early-userspace/index.rst | 16 +++ Documentation/driver-api/index.rst | 1 + Documentation/early-userspace/buffer-format.rst | 119 ---------------- .../early-userspace/early_userspace_support.rst | 154 --------------------- Documentation/early-userspace/index.rst | 18 --- Documentation/filesystems/nfs/nfsroot.txt | 2 +- .../filesystems/ramfs-rootfs-initramfs.txt | 4 +- 9 files changed, 293 insertions(+), 294 deletions(-) create mode 100644 Documentation/driver-api/early-userspace/buffer-format.rst create mode 100644 Documentation/driver-api/early-userspace/early_userspace_support.rst create mode 100644 Documentation/driver-api/early-userspace/index.rst delete mode 100644 Documentation/early-userspace/buffer-format.rst delete mode 100644 Documentation/early-userspace/early_userspace_support.rst delete mode 100644 Documentation/early-userspace/index.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/early-userspace/buffer-format.rst b/Documentation/driver-api/early-userspace/buffer-format.rst new file mode 100644 index 000000000000..7f74e301fdf3 --- /dev/null +++ b/Documentation/driver-api/early-userspace/buffer-format.rst @@ -0,0 +1,119 @@ +======================= +initramfs buffer format +======================= + +Al Viro, H. Peter Anvin + +Last revision: 2002-01-13 + +Starting with kernel 2.5.x, the old "initial ramdisk" protocol is +getting {replaced/complemented} with the new "initial ramfs" +(initramfs) protocol. The initramfs contents is passed using the same +memory buffer protocol used by the initrd protocol, but the contents +is different. The initramfs buffer contains an archive which is +expanded into a ramfs filesystem; this document details the format of +the initramfs buffer format. + +The initramfs buffer format is based around the "newc" or "crc" CPIO +formats, and can be created with the cpio(1) utility. The cpio +archive can be compressed using gzip(1). One valid version of an +initramfs buffer is thus a single .cpio.gz file. + +The full format of the initramfs buffer is defined by the following +grammar, where:: + + * is used to indicate "0 or more occurrences of" + (|) indicates alternatives + + indicates concatenation + GZIP() indicates the gzip(1) of the operand + ALGN(n) means padding with null bytes to an n-byte boundary + + initramfs := ("\0" | cpio_archive | cpio_gzip_archive)* + + cpio_gzip_archive := GZIP(cpio_archive) + + cpio_archive := cpio_file* + ( | cpio_trailer) + + cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data + + cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4) + + +In human terms, the initramfs buffer contains a collection of +compressed and/or uncompressed cpio archives (in the "newc" or "crc" +formats); arbitrary amounts zero bytes (for padding) can be added +between members. + +The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is +not ignored; see "handling of hard links" below. + +The structure of the cpio_header is as follows (all fields contain +hexadecimal ASCII numbers fully padded with '0' on the left to the +full width of the field, for example, the integer 4780 is represented +by the ASCII string "000012ac"): + +============= ================== ============================================== +Field name Field size Meaning +============= ================== ============================================== +c_magic 6 bytes The string "070701" or "070702" +c_ino 8 bytes File inode number +c_mode 8 bytes File mode and permissions +c_uid 8 bytes File uid +c_gid 8 bytes File gid +c_nlink 8 bytes Number of links +c_mtime 8 bytes Modification time +c_filesize 8 bytes Size of data field +c_maj 8 bytes Major part of file device number +c_min 8 bytes Minor part of file device number +c_rmaj 8 bytes Major part of device node reference +c_rmin 8 bytes Minor part of device node reference +c_namesize 8 bytes Length of filename, including final \0 +c_chksum 8 bytes Checksum of data field if c_magic is 070702; + otherwise zero +============= ================== ============================================== + +The c_mode field matches the contents of st_mode returned by stat(2) +on Linux, and encodes the file type and file permissions. + +The c_filesize should be zero for any file which is not a regular file +or symlink. + +The c_chksum field contains a simple 32-bit unsigned sum of all the +bytes in the data field. cpio(1) refers to this as "crc", which is +clearly incorrect (a cyclic redundancy check is a different and +significantly stronger integrity check), however, this is the +algorithm used. + +If the filename is "TRAILER!!!" this is actually an end-of-archive +marker; the c_filesize for an end-of-archive marker must be zero. + + +Handling of hard links +====================== + +When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino) +tuple is looked up in a tuple buffer. If not found, it is entered in +the tuple buffer and the entry is created as usual; if found, a hard +link rather than a second copy of the file is created. It is not +necessary (but permitted) to include a second copy of the file +contents; if the file contents is not included, the c_filesize field +should be set to zero to indicate no data section follows. If data is +present, the previous instance of the file is overwritten; this allows +the data-carrying instance of a file to occur anywhere in the sequence +(GNU cpio is reported to attach the data to the last instance of a +file only.) + +c_filesize must not be zero for a symlink. + +When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is +reset. This permits archives which are generated independently to be +concatenated. + +To combine file data from different sources (without having to +regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of +the following techniques can be used: + +a) Separate the different file data sources with a "TRAILER!!!" + end-of-archive marker, or + +b) Make sure c_nlink == 1 for all nondirectory entries. diff --git a/Documentation/driver-api/early-userspace/early_userspace_support.rst b/Documentation/driver-api/early-userspace/early_userspace_support.rst new file mode 100644 index 000000000000..3deefb34046b --- /dev/null +++ b/Documentation/driver-api/early-userspace/early_userspace_support.rst @@ -0,0 +1,154 @@ +======================= +Early userspace support +======================= + +Last update: 2004-12-20 tlh + + +"Early userspace" is a set of libraries and programs that provide +various pieces of functionality that are important enough to be +available while a Linux kernel is coming up, but that don't need to be +run inside the kernel itself. + +It consists of several major infrastructure components: + +- gen_init_cpio, a program that builds a cpio-format archive + containing a root filesystem image. This archive is compressed, and + the compressed image is linked into the kernel image. +- initramfs, a chunk of code that unpacks the compressed cpio image + midway through the kernel boot process. +- klibc, a userspace C library, currently packaged separately, that is + optimized for correctness and small size. + +The cpio file format used by initramfs is the "newc" (aka "cpio -H newc") +format, and is documented in the file "buffer-format.txt". There are +two ways to add an early userspace image: specify an existing cpio +archive to be used as the image or have the kernel build process build +the image from specifications. + +CPIO ARCHIVE method +------------------- + +You can create a cpio archive that contains the early userspace image. +Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it +will be used directly. Only a single cpio file may be specified in +CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in +combination with a cpio archive. + +IMAGE BUILDING method +--------------------- + +The kernel build process can also build an early userspace image from +source parts rather than supplying a cpio archive. This method provides +a way to create images with root-owned files even though the image was +built by an unprivileged user. + +The image is specified as one or more sources in +CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files - +cpio archives are *not* allowed when building from sources. + +A source directory will have it and all of its contents packaged. The +specified directory name will be mapped to '/'. When packaging a +directory, limited user and group ID translation can be performed. +INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to +user root (0). INITRAMFS_ROOT_GID can be set to a group ID that needs +to be mapped to group root (0). + +A source file must be directives in the format required by the +usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the +file format). The directives in the file will be passed directly to +usr/gen_init_cpio. + +When a combination of directories and files are specified then the +initramfs image will be an aggregate of all of them. In this way a user +can create a 'root-image' directory and install all files into it. +Because device-special files cannot be created by a unprivileged user, +special files can be listed in a 'root-files' file. Both 'root-image' +and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete +early userspace image can be built by an unprivileged user. + +As a technical note, when directories and files are specified, the +entire CONFIG_INITRAMFS_SOURCE is passed to +usr/gen_initramfs_list.sh. This means that CONFIG_INITRAMFS_SOURCE +can really be interpreted as any legal argument to +gen_initramfs_list.sh. If a directory is specified as an argument then +the contents are scanned, uid/gid translation is performed, and +usr/gen_init_cpio file directives are output. If a directory is +specified as an argument to usr/gen_initramfs_list.sh then the +contents of the file are simply copied to the output. All of the output +directives from directory scanning and file contents copying are +processed by usr/gen_init_cpio. + +See also 'usr/gen_initramfs_list.sh -h'. + +Where's this all leading? +========================= + +The klibc distribution contains some of the necessary software to make +early userspace useful. The klibc distribution is currently +maintained separately from the kernel. + +You can obtain somewhat infrequent snapshots of klibc from +https://www.kernel.org/pub/linux/libs/klibc/ + +For active users, you are better off using the klibc git +repository, at http://git.kernel.org/?p=libs/klibc/klibc.git + +The standalone klibc distribution currently provides three components, +in addition to the klibc library: + +- ipconfig, a program that configures network interfaces. It can + configure them statically, or use DHCP to obtain information + dynamically (aka "IP autoconfiguration"). +- nfsmount, a program that can mount an NFS filesystem. +- kinit, the "glue" that uses ipconfig and nfsmount to replace the old + support for IP autoconfig, mount a filesystem over NFS, and continue + system boot using that filesystem as root. + +kinit is built as a single statically linked binary to save space. + +Eventually, several more chunks of kernel functionality will hopefully +move to early userspace: + +- Almost all of init/do_mounts* (the beginning of this is already in + place) +- ACPI table parsing +- Insert unwieldy subsystem that doesn't really need to be in kernel + space here + +If kinit doesn't meet your current needs and you've got bytes to burn, +the klibc distribution includes a small Bourne-compatible shell (ash) +and a number of other utilities, so you can replace kinit and build +custom initramfs images that meet your needs exactly. + +For questions and help, you can sign up for the early userspace +mailing list at http://www.zytor.com/mailman/listinfo/klibc + +How does it work? +================= + +The kernel has currently 3 ways to mount the root filesystem: + +a) all required device and filesystem drivers compiled into the kernel, no + initrd. init/main.c:init() will call prepare_namespace() to mount the + final root filesystem, based on the root= option and optional init= to run + some other init binary than listed at the end of init/main.c:init(). + +b) some device and filesystem drivers built as modules and stored in an + initrd. The initrd must contain a binary '/linuxrc' which is supposed to + load these driver modules. It is also possible to mount the final root + filesystem via linuxrc and use the pivot_root syscall. The initrd is + mounted and executed via prepare_namespace(). + +c) using initramfs. The call to prepare_namespace() must be skipped. + This means that a binary must do all the work. Said binary can be stored + into initramfs either via modifying usr/gen_init_cpio.c or via the new + initrd format, an cpio archive. It must be called "/init". This binary + is responsible to do all the things prepare_namespace() would do. + + To maintain backwards compatibility, the /init binary will only run if it + comes via an initramfs cpio archive. If this is not the case, + init/main.c:init() will run prepare_namespace() to mount the final root + and exec one of the predefined init binaries. + +Bryan O'Sullivan diff --git a/Documentation/driver-api/early-userspace/index.rst b/Documentation/driver-api/early-userspace/index.rst new file mode 100644 index 000000000000..6f20c3c560d8 --- /dev/null +++ b/Documentation/driver-api/early-userspace/index.rst @@ -0,0 +1,16 @@ +=============== +Early Userspace +=============== + +.. toctree:: + :maxdepth: 1 + + early_userspace_support + buffer-format + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index baa77a666e46..0f281f4f648f 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -16,6 +16,7 @@ available subsections can be seen below. basics infrastructure + early-userspace/index pm/index clk device-io diff --git a/Documentation/early-userspace/buffer-format.rst b/Documentation/early-userspace/buffer-format.rst deleted file mode 100644 index 7f74e301fdf3..000000000000 --- a/Documentation/early-userspace/buffer-format.rst +++ /dev/null @@ -1,119 +0,0 @@ -======================= -initramfs buffer format -======================= - -Al Viro, H. Peter Anvin - -Last revision: 2002-01-13 - -Starting with kernel 2.5.x, the old "initial ramdisk" protocol is -getting {replaced/complemented} with the new "initial ramfs" -(initramfs) protocol. The initramfs contents is passed using the same -memory buffer protocol used by the initrd protocol, but the contents -is different. The initramfs buffer contains an archive which is -expanded into a ramfs filesystem; this document details the format of -the initramfs buffer format. - -The initramfs buffer format is based around the "newc" or "crc" CPIO -formats, and can be created with the cpio(1) utility. The cpio -archive can be compressed using gzip(1). One valid version of an -initramfs buffer is thus a single .cpio.gz file. - -The full format of the initramfs buffer is defined by the following -grammar, where:: - - * is used to indicate "0 or more occurrences of" - (|) indicates alternatives - + indicates concatenation - GZIP() indicates the gzip(1) of the operand - ALGN(n) means padding with null bytes to an n-byte boundary - - initramfs := ("\0" | cpio_archive | cpio_gzip_archive)* - - cpio_gzip_archive := GZIP(cpio_archive) - - cpio_archive := cpio_file* + ( | cpio_trailer) - - cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data - - cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4) - - -In human terms, the initramfs buffer contains a collection of -compressed and/or uncompressed cpio archives (in the "newc" or "crc" -formats); arbitrary amounts zero bytes (for padding) can be added -between members. - -The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is -not ignored; see "handling of hard links" below. - -The structure of the cpio_header is as follows (all fields contain -hexadecimal ASCII numbers fully padded with '0' on the left to the -full width of the field, for example, the integer 4780 is represented -by the ASCII string "000012ac"): - -============= ================== ============================================== -Field name Field size Meaning -============= ================== ============================================== -c_magic 6 bytes The string "070701" or "070702" -c_ino 8 bytes File inode number -c_mode 8 bytes File mode and permissions -c_uid 8 bytes File uid -c_gid 8 bytes File gid -c_nlink 8 bytes Number of links -c_mtime 8 bytes Modification time -c_filesize 8 bytes Size of data field -c_maj 8 bytes Major part of file device number -c_min 8 bytes Minor part of file device number -c_rmaj 8 bytes Major part of device node reference -c_rmin 8 bytes Minor part of device node reference -c_namesize 8 bytes Length of filename, including final \0 -c_chksum 8 bytes Checksum of data field if c_magic is 070702; - otherwise zero -============= ================== ============================================== - -The c_mode field matches the contents of st_mode returned by stat(2) -on Linux, and encodes the file type and file permissions. - -The c_filesize should be zero for any file which is not a regular file -or symlink. - -The c_chksum field contains a simple 32-bit unsigned sum of all the -bytes in the data field. cpio(1) refers to this as "crc", which is -clearly incorrect (a cyclic redundancy check is a different and -significantly stronger integrity check), however, this is the -algorithm used. - -If the filename is "TRAILER!!!" this is actually an end-of-archive -marker; the c_filesize for an end-of-archive marker must be zero. - - -Handling of hard links -====================== - -When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino) -tuple is looked up in a tuple buffer. If not found, it is entered in -the tuple buffer and the entry is created as usual; if found, a hard -link rather than a second copy of the file is created. It is not -necessary (but permitted) to include a second copy of the file -contents; if the file contents is not included, the c_filesize field -should be set to zero to indicate no data section follows. If data is -present, the previous instance of the file is overwritten; this allows -the data-carrying instance of a file to occur anywhere in the sequence -(GNU cpio is reported to attach the data to the last instance of a -file only.) - -c_filesize must not be zero for a symlink. - -When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is -reset. This permits archives which are generated independently to be -concatenated. - -To combine file data from different sources (without having to -regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of -the following techniques can be used: - -a) Separate the different file data sources with a "TRAILER!!!" - end-of-archive marker, or - -b) Make sure c_nlink == 1 for all nondirectory entries. diff --git a/Documentation/early-userspace/early_userspace_support.rst b/Documentation/early-userspace/early_userspace_support.rst deleted file mode 100644 index 3deefb34046b..000000000000 --- a/Documentation/early-userspace/early_userspace_support.rst +++ /dev/null @@ -1,154 +0,0 @@ -======================= -Early userspace support -======================= - -Last update: 2004-12-20 tlh - - -"Early userspace" is a set of libraries and programs that provide -various pieces of functionality that are important enough to be -available while a Linux kernel is coming up, but that don't need to be -run inside the kernel itself. - -It consists of several major infrastructure components: - -- gen_init_cpio, a program that builds a cpio-format archive - containing a root filesystem image. This archive is compressed, and - the compressed image is linked into the kernel image. -- initramfs, a chunk of code that unpacks the compressed cpio image - midway through the kernel boot process. -- klibc, a userspace C library, currently packaged separately, that is - optimized for correctness and small size. - -The cpio file format used by initramfs is the "newc" (aka "cpio -H newc") -format, and is documented in the file "buffer-format.txt". There are -two ways to add an early userspace image: specify an existing cpio -archive to be used as the image or have the kernel build process build -the image from specifications. - -CPIO ARCHIVE method -------------------- - -You can create a cpio archive that contains the early userspace image. -Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it -will be used directly. Only a single cpio file may be specified in -CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in -combination with a cpio archive. - -IMAGE BUILDING method ---------------------- - -The kernel build process can also build an early userspace image from -source parts rather than supplying a cpio archive. This method provides -a way to create images with root-owned files even though the image was -built by an unprivileged user. - -The image is specified as one or more sources in -CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files - -cpio archives are *not* allowed when building from sources. - -A source directory will have it and all of its contents packaged. The -specified directory name will be mapped to '/'. When packaging a -directory, limited user and group ID translation can be performed. -INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to -user root (0). INITRAMFS_ROOT_GID can be set to a group ID that needs -to be mapped to group root (0). - -A source file must be directives in the format required by the -usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the -file format). The directives in the file will be passed directly to -usr/gen_init_cpio. - -When a combination of directories and files are specified then the -initramfs image will be an aggregate of all of them. In this way a user -can create a 'root-image' directory and install all files into it. -Because device-special files cannot be created by a unprivileged user, -special files can be listed in a 'root-files' file. Both 'root-image' -and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete -early userspace image can be built by an unprivileged user. - -As a technical note, when directories and files are specified, the -entire CONFIG_INITRAMFS_SOURCE is passed to -usr/gen_initramfs_list.sh. This means that CONFIG_INITRAMFS_SOURCE -can really be interpreted as any legal argument to -gen_initramfs_list.sh. If a directory is specified as an argument then -the contents are scanned, uid/gid translation is performed, and -usr/gen_init_cpio file directives are output. If a directory is -specified as an argument to usr/gen_initramfs_list.sh then the -contents of the file are simply copied to the output. All of the output -directives from directory scanning and file contents copying are -processed by usr/gen_init_cpio. - -See also 'usr/gen_initramfs_list.sh -h'. - -Where's this all leading? -========================= - -The klibc distribution contains some of the necessary software to make -early userspace useful. The klibc distribution is currently -maintained separately from the kernel. - -You can obtain somewhat infrequent snapshots of klibc from -https://www.kernel.org/pub/linux/libs/klibc/ - -For active users, you are better off using the klibc git -repository, at http://git.kernel.org/?p=libs/klibc/klibc.git - -The standalone klibc distribution currently provides three components, -in addition to the klibc library: - -- ipconfig, a program that configures network interfaces. It can - configure them statically, or use DHCP to obtain information - dynamically (aka "IP autoconfiguration"). -- nfsmount, a program that can mount an NFS filesystem. -- kinit, the "glue" that uses ipconfig and nfsmount to replace the old - support for IP autoconfig, mount a filesystem over NFS, and continue - system boot using that filesystem as root. - -kinit is built as a single statically linked binary to save space. - -Eventually, several more chunks of kernel functionality will hopefully -move to early userspace: - -- Almost all of init/do_mounts* (the beginning of this is already in - place) -- ACPI table parsing -- Insert unwieldy subsystem that doesn't really need to be in kernel - space here - -If kinit doesn't meet your current needs and you've got bytes to burn, -the klibc distribution includes a small Bourne-compatible shell (ash) -and a number of other utilities, so you can replace kinit and build -custom initramfs images that meet your needs exactly. - -For questions and help, you can sign up for the early userspace -mailing list at http://www.zytor.com/mailman/listinfo/klibc - -How does it work? -================= - -The kernel has currently 3 ways to mount the root filesystem: - -a) all required device and filesystem drivers compiled into the kernel, no - initrd. init/main.c:init() will call prepare_namespace() to mount the - final root filesystem, based on the root= option and optional init= to run - some other init binary than listed at the end of init/main.c:init(). - -b) some device and filesystem drivers built as modules and stored in an - initrd. The initrd must contain a binary '/linuxrc' which is supposed to - load these driver modules. It is also possible to mount the final root - filesystem via linuxrc and use the pivot_root syscall. The initrd is - mounted and executed via prepare_namespace(). - -c) using initramfs. The call to prepare_namespace() must be skipped. - This means that a binary must do all the work. Said binary can be stored - into initramfs either via modifying usr/gen_init_cpio.c or via the new - initrd format, an cpio archive. It must be called "/init". This binary - is responsible to do all the things prepare_namespace() would do. - - To maintain backwards compatibility, the /init binary will only run if it - comes via an initramfs cpio archive. If this is not the case, - init/main.c:init() will run prepare_namespace() to mount the final root - and exec one of the predefined init binaries. - -Bryan O'Sullivan diff --git a/Documentation/early-userspace/index.rst b/Documentation/early-userspace/index.rst deleted file mode 100644 index 2b8eb6132058..000000000000 --- a/Documentation/early-userspace/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -:orphan: - -=============== -Early Userspace -=============== - -.. toctree:: - :maxdepth: 1 - - early_userspace_support - buffer-format - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index 4862d3d77e27..ae4332464560 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -239,7 +239,7 @@ rdinit= A description of the process of mounting the root file system can be found in: - Documentation/early-userspace/early_userspace_support.rst + Documentation/driver-api/early-userspace/early_userspace_support.rst diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index fa985909dbca..97d42ccaa92d 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -105,7 +105,7 @@ All this differs from the old initrd in several ways: - The old initrd file was a gzipped filesystem image (in some file format, such as ext2, that needed a driver built into the kernel), while the new initramfs archive is a gzipped cpio archive (like tar only simpler, - see cpio(1) and Documentation/early-userspace/buffer-format.rst). The + see cpio(1) and Documentation/driver-api/early-userspace/buffer-format.rst). The kernel's cpio extraction code is not only extremely small, it's also __init text and data that can be discarded during the boot process. @@ -159,7 +159,7 @@ One advantage of the configuration file is that root access is not required to set permissions or create device nodes in the new archive. (Note that those two example "file" entries expect to find files named "init.sh" and "busybox" in a directory called "initramfs", under the linux-2.6.* directory. See -Documentation/early-userspace/early_userspace_support.rst for more details.) +Documentation/driver-api/early-userspace/early_userspace_support.rst for more details.) The kernel does not depend on external cpio tools. If you specify a directory instead of a configuration file, the kernel's build infrastructure -- cgit From 570432470275c3da15b85362bc1461945b9c1919 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Apr 2019 16:48:00 -0300 Subject: docs: admin-guide: move sysctl directory to it The stuff under sysctl describes /sys interface from userspace point of view. So, add it to the admin-guide and remove the :orphan: from its index file. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/mm/index.rst | 2 +- Documentation/admin-guide/mm/ksm.rst | 2 +- Documentation/admin-guide/sysctl/abi.rst | 67 ++ Documentation/admin-guide/sysctl/fs.rst | 384 ++++++++ Documentation/admin-guide/sysctl/index.rst | 98 ++ Documentation/admin-guide/sysctl/kernel.rst | 1177 +++++++++++++++++++++++ Documentation/admin-guide/sysctl/net.rst | 461 +++++++++ Documentation/admin-guide/sysctl/sunrpc.rst | 25 + Documentation/admin-guide/sysctl/user.rst | 78 ++ Documentation/admin-guide/sysctl/vm.rst | 964 +++++++++++++++++++ Documentation/core-api/printk-formats.rst | 2 +- Documentation/filesystems/proc.txt | 2 +- Documentation/networking/ip-sysctl.txt | 2 +- Documentation/sysctl/abi.rst | 67 -- Documentation/sysctl/fs.rst | 384 -------- Documentation/sysctl/index.rst | 100 -- Documentation/sysctl/kernel.rst | 1177 ----------------------- Documentation/sysctl/net.rst | 461 --------- Documentation/sysctl/sunrpc.rst | 25 - Documentation/sysctl/user.rst | 78 -- Documentation/sysctl/vm.rst | 964 ------------------- Documentation/vm/unevictable-lru.rst | 2 +- 24 files changed, 3262 insertions(+), 3263 deletions(-) create mode 100644 Documentation/admin-guide/sysctl/abi.rst create mode 100644 Documentation/admin-guide/sysctl/fs.rst create mode 100644 Documentation/admin-guide/sysctl/index.rst create mode 100644 Documentation/admin-guide/sysctl/kernel.rst create mode 100644 Documentation/admin-guide/sysctl/net.rst create mode 100644 Documentation/admin-guide/sysctl/sunrpc.rst create mode 100644 Documentation/admin-guide/sysctl/user.rst create mode 100644 Documentation/admin-guide/sysctl/vm.rst delete mode 100644 Documentation/sysctl/abi.rst delete mode 100644 Documentation/sysctl/fs.rst delete mode 100644 Documentation/sysctl/index.rst delete mode 100644 Documentation/sysctl/kernel.rst delete mode 100644 Documentation/sysctl/net.rst delete mode 100644 Documentation/sysctl/sunrpc.rst delete mode 100644 Documentation/sysctl/user.rst delete mode 100644 Documentation/sysctl/vm.rst (limited to 'Documentation') diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 64e97a969857..5c6ae1ccee1a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -16,6 +16,7 @@ etc. README kernel-parameters devices + sysctl/index This section describes CPU vulnerabilities and their mitigations. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e8e28cac32a3..b323f5d4366a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3144,7 +3144,7 @@ numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 'node', 'default' can be specified This can be set from sysctl after boot. - See Documentation/sysctl/vm.rst for details. + See Documentation/admin-guide/sysctl/vm.rst for details. ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. See Documentation/debugging-via-ohci1394.txt for more diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index f5e92f33f96e..5f61a6c429e0 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -11,7 +11,7 @@ processes address space and many other cool things. Linux memory management is a complex system with many configurable settings. Most of these settings are available via ``/proc`` filesystem and can be quired and adjusted using ``sysctl``. These APIs -are described in Documentation/sysctl/vm.rst and in `man 5 proc`_. +are described in Documentation/admin-guide/sysctl/vm.rst and in `man 5 proc`_. .. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 7b2b8767c0b4..874eb0c77d34 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -59,7 +59,7 @@ MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. If a region of memory must be split into at least one new MADV_MERGEABLE or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process -will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.rst). +will exceed ``vm.max_map_count`` (see Documentation/admin-guide/sysctl/vm.rst). Like other madvise calls, they are intended for use on mapped areas of the user address space: they will report ENOMEM if the specified range diff --git a/Documentation/admin-guide/sysctl/abi.rst b/Documentation/admin-guide/sysctl/abi.rst new file mode 100644 index 000000000000..599bcde7f0b7 --- /dev/null +++ b/Documentation/admin-guide/sysctl/abi.rst @@ -0,0 +1,67 @@ +================================ +Documentation for /proc/sys/abi/ +================================ + +kernel version 2.6.0.test2 + +Copyright (c) 2003, Fabian Frederick + +For general info: index.rst. + +------------------------------------------------------------------------------ + +This path is binary emulation relevant aka personality types aka abi. +When a process is executed, it's linked to an exec_domain whose +personality is defined using values available from /proc/sys/abi. +You can find further details about abi in include/linux/personality.h. + +Here are the files featuring in 2.6 kernel: + +- defhandler_coff +- defhandler_elf +- defhandler_lcall7 +- defhandler_libcso +- fake_utsname +- trace + +defhandler_coff +--------------- + +defined value: + PER_SCOSVR3:: + + 0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE + +defhandler_elf +-------------- + +defined value: + PER_LINUX:: + + 0 + +defhandler_lcall7 +----------------- + +defined value : + PER_SVR4:: + + 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO, + +defhandler_libsco +----------------- + +defined value: + PER_SVR4:: + + 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO, + +fake_utsname +------------ + +Unused + +trace +----- + +Unused diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst new file mode 100644 index 000000000000..2a45119e3331 --- /dev/null +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -0,0 +1,384 @@ +=============================== +Documentation for /proc/sys/fs/ +=============================== + +kernel version 2.2.10 + +Copyright (c) 1998, 1999, Rik van Riel + +Copyright (c) 2009, Shen Feng + +For general info and legal blurb, please look in intro.rst. + +------------------------------------------------------------------------------ + +This file contains documentation for the sysctl files in +/proc/sys/fs/ and is valid for Linux kernel version 2.2. + +The files in this directory can be used to tune and monitor +miscellaneous and general things in the operation of the Linux +kernel. Since some of the files _can_ be used to screw up your +system, it is advisable to read both documentation and source +before actually making adjustments. + +1. /proc/sys/fs +=============== + +Currently, these files are in /proc/sys/fs: + +- aio-max-nr +- aio-nr +- dentry-state +- dquot-max +- dquot-nr +- file-max +- file-nr +- inode-max +- inode-nr +- inode-state +- nr_open +- overflowuid +- overflowgid +- pipe-user-pages-hard +- pipe-user-pages-soft +- protected_fifos +- protected_hardlinks +- protected_regular +- protected_symlinks +- suid_dumpable +- super-max +- super-nr + + +aio-nr & aio-max-nr +------------------- + +aio-nr is the running total of the number of events specified on the +io_setup system call for all currently active aio contexts. If aio-nr +reaches aio-max-nr then io_setup will fail with EAGAIN. Note that +raising aio-max-nr does not result in the pre-allocation or re-sizing +of any kernel data structures. + + +dentry-state +------------ + +From linux/include/linux/dcache.h:: + + struct dentry_stat_t dentry_stat { + int nr_dentry; + int nr_unused; + int age_limit; /* age in seconds */ + int want_pages; /* pages requested by system */ + int nr_negative; /* # of unused negative dentries */ + int dummy; /* Reserved for future use */ + }; + +Dentries are dynamically allocated and deallocated. + +nr_dentry shows the total number of dentries allocated (active ++ unused). nr_unused shows the number of dentries that are not +actively used, but are saved in the LRU list for future reuse. + +Age_limit is the age in seconds after which dcache entries +can be reclaimed when memory is short and want_pages is +nonzero when shrink_dcache_pages() has been called and the +dcache isn't pruned yet. + +nr_negative shows the number of unused dentries that are also +negative dentries which do not map to any files. Instead, +they help speeding up rejection of non-existing files provided +by the users. + + +dquot-max & dquot-nr +-------------------- + +The file dquot-max shows the maximum number of cached disk +quota entries. + +The file dquot-nr shows the number of allocated disk quota +entries and the number of free disk quota entries. + +If the number of free cached disk quotas is very low and +you have some awesome number of simultaneous system users, +you might want to raise the limit. + + +file-max & file-nr +------------------ + +The value in file-max denotes the maximum number of file- +handles that the Linux kernel will allocate. When you get lots +of error messages about running out of file handles, you might +want to increase this limit. + +Historically,the kernel was able to allocate file handles +dynamically, but not to free them again. The three values in +file-nr denote the number of allocated file handles, the number +of allocated but unused file handles, and the maximum number of +file handles. Linux 2.6 always reports 0 as the number of free +file handles -- this is not an error, it just means that the +number of allocated file handles exactly matches the number of +used file handles. + +Attempts to allocate more file descriptors than file-max are +reported with printk, look for "VFS: file-max limit +reached". + + +nr_open +------- + +This denotes the maximum number of file-handles a process can +allocate. Default value is 1024*1024 (1048576) which should be +enough for most machines. Actual limit depends on RLIMIT_NOFILE +resource limit. + + +inode-max, inode-nr & inode-state +--------------------------------- + +As with file handles, the kernel allocates the inode structures +dynamically, but can't free them yet. + +The value in inode-max denotes the maximum number of inode +handlers. This value should be 3-4 times larger than the value +in file-max, since stdin, stdout and network sockets also +need an inode struct to handle them. When you regularly run +out of inodes, you need to increase this value. + +The file inode-nr contains the first two items from +inode-state, so we'll skip to that file... + +Inode-state contains three actual numbers and four dummies. +The actual numbers are, in order of appearance, nr_inodes, +nr_free_inodes and preshrink. + +Nr_inodes stands for the number of inodes the system has +allocated, this can be slightly more than inode-max because +Linux allocates them one pageful at a time. + +Nr_free_inodes represents the number of free inodes (?) and +preshrink is nonzero when the nr_inodes > inode-max and the +system needs to prune the inode list instead of allocating +more. + + +overflowgid & overflowuid +------------------------- + +Some filesystems only support 16-bit UIDs and GIDs, although in Linux +UIDs and GIDs are 32 bits. When one of these filesystems is mounted +with writes enabled, any UID or GID that would exceed 65535 is translated +to a fixed value before being written to disk. + +These sysctls allow you to change the value of the fixed UID and GID. +The default is 65534. + + +pipe-user-pages-hard +-------------------- + +Maximum total number of pages a non-privileged user may allocate for pipes. +Once this limit is reached, no new pipes may be allocated until usage goes +below the limit again. When set to 0, no limit is applied, which is the default +setting. + + +pipe-user-pages-soft +-------------------- + +Maximum total number of pages a non-privileged user may allocate for pipes +before the pipe size gets limited to a single page. Once this limit is reached, +new pipes will be limited to a single page in size for this user in order to +limit total memory usage, and trying to increase them using fcntl() will be +denied until usage goes below the limit again. The default value allows to +allocate up to 1024 pipes at their default size. When set to 0, no limit is +applied. + + +protected_fifos +--------------- + +The intent of this protection is to avoid unintentional writes to +an attacker-controlled FIFO, where a program expected to create a regular +file. + +When set to "0", writing to FIFOs is unrestricted. + +When set to "1" don't allow O_CREAT open on FIFOs that we don't own +in world writable sticky directories, unless they are owned by the +owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + +This protection is based on the restrictions in Openwall. + + +protected_hardlinks +-------------------- + +A long-standing class of security issues is the hardlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given hardlink (i.e. a +root process follows a hardlink created by another user). Additionally, +on systems without separated partitions, this stops unauthorized users +from "pinning" vulnerable setuid/setgid files against being upgraded by +the administrator, or linking to special files. + +When set to "0", hardlink creation behavior is unrestricted. + +When set to "1" hardlinks cannot be created by users if they do not +already own the source file, or do not have read/write access to it. + +This protection is based on the restrictions in Openwall and grsecurity. + + +protected_regular +----------------- + +This protection is similar to protected_fifos, but it +avoids writes to an attacker-controlled regular file, where a program +expected to create one. + +When set to "0", writing to regular files is unrestricted. + +When set to "1" don't allow O_CREAT open on regular files that we +don't own in world writable sticky directories, unless they are +owned by the owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + + +protected_symlinks +------------------ + +A long-standing class of security issues is the symlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given symlink (i.e. a +root process follows a symlink belonging to another user). For a likely +incomplete list of hundreds of examples across the years, please see: +http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp + +When set to "0", symlink following behavior is unrestricted. + +When set to "1" symlinks are permitted to be followed only when outside +a sticky world-writable directory, or when the uid of the symlink and +follower match, or when the directory owner matches the symlink's owner. + +This protection is based on the restrictions in Openwall and grsecurity. + + +suid_dumpable: +-------------- + +This value can be used to query and set the core dump mode for setuid +or otherwise protected/tainted binaries. The modes are + += ========== =============================================================== +0 (default) traditional behaviour. Any process which has changed + privilege levels or is execute only will not be dumped. +1 (debug) all processes dump core when possible. The core dump is + owned by the current user and no security is applied. This is + intended for system debugging situations only. + Ptrace is unchecked. + This is insecure as it allows regular users to examine the + memory contents of privileged processes. +2 (suidsafe) any binary which normally would not be dumped is dumped + anyway, but only if the "core_pattern" kernel sysctl is set to + either a pipe handler or a fully qualified path. (For more + details on this limitation, see CVE-2006-2451.) This mode is + appropriate when administrators are attempting to debug + problems in a normal environment, and either have a core dump + pipe handler that knows to treat privileged core dumps with + care, or specific directory defined for catching core dumps. + If a core dump happens without a pipe handler or fully + qualified path, a message will be emitted to syslog warning + about the lack of a correct setting. += ========== =============================================================== + + +super-max & super-nr +-------------------- + +These numbers control the maximum number of superblocks, and +thus the maximum number of mounted filesystems the kernel +can have. You only need to increase super-max if you need to +mount more filesystems than the current value in super-max +allows you to. + + +aio-nr & aio-max-nr +------------------- + +aio-nr shows the current system-wide number of asynchronous io +requests. aio-max-nr allows you to change the maximum value +aio-nr can grow to. + + +mount-max +--------- + +This denotes the maximum number of mounts that may exist +in a mount namespace. + + + +2. /proc/sys/fs/binfmt_misc +=========================== + +Documentation for the files in /proc/sys/fs/binfmt_misc is +in Documentation/admin-guide/binfmt-misc.rst. + + +3. /proc/sys/fs/mqueue - POSIX message queues filesystem +======================================================== + + +The "mqueue" filesystem provides the necessary kernel features to enable the +creation of a user space library that implements the POSIX message queues +API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System +Interfaces specification.) + +The "mqueue" filesystem contains values for determining/setting the amount of +resources used by the file system. + +/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the +maximum number of message queues allowed on the system. + +/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the +maximum number of messages in a queue value. In fact it is the limiting value +for another (user) limit which is set in mq_open invocation. This attribute of +a queue must be less or equal then msg_max. + +/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the +maximum message size value (it is every message queue's attribute set during +its creation). + +/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the +default number of messages in a queue value if attr parameter of mq_open(2) is +NULL. If it exceed msg_max, the default value is initialized msg_max. + +/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting +the default message size value if attr parameter of mq_open(2) is NULL. If it +exceed msgsize_max, the default value is initialized msgsize_max. + +4. /proc/sys/fs/epoll - Configuration options for the epoll interface +===================================================================== + +This directory contains configuration options for the epoll(7) interface. + +max_user_watches +---------------- + +Every epoll file descriptor can store a number of files to be monitored +for event readiness. Each one of these monitored files constitutes a "watch". +This configuration option sets the maximum number of "watches" that are +allowed for each user. +Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes +on a 64bit one. +The current default value for max_user_watches is the 1/32 of the available +low memory, divided for the "watch" cost in bytes. diff --git a/Documentation/admin-guide/sysctl/index.rst b/Documentation/admin-guide/sysctl/index.rst new file mode 100644 index 000000000000..03346f98c7b9 --- /dev/null +++ b/Documentation/admin-guide/sysctl/index.rst @@ -0,0 +1,98 @@ +=========================== +Documentation for /proc/sys +=========================== + +Copyright (c) 1998, 1999, Rik van Riel + +------------------------------------------------------------------------------ + +'Why', I hear you ask, 'would anyone even _want_ documentation +for them sysctl files? If anybody really needs it, it's all in +the source...' + +Well, this documentation is written because some people either +don't know they need to tweak something, or because they don't +have the time or knowledge to read the source code. + +Furthermore, the programmers who built sysctl have built it to +be actually used, not just for the fun of programming it :-) + +------------------------------------------------------------------------------ + +Legal blurb: + +As usual, there are two main things to consider: + +1. you get what you pay for +2. it's free + +The consequences are that I won't guarantee the correctness of +this document, and if you come to me complaining about how you +screwed up your system because of wrong documentation, I won't +feel sorry for you. I might even laugh at you... + +But of course, if you _do_ manage to screw up your system using +only the sysctl options used in this file, I'd like to hear of +it. Not only to have a great laugh, but also to make sure that +you're the last RTFMing person to screw up. + +In short, e-mail your suggestions, corrections and / or horror +stories to: + +Rik van Riel. + +-------------------------------------------------------------- + +Introduction +============ + +Sysctl is a means of configuring certain aspects of the kernel +at run-time, and the /proc/sys/ directory is there so that you +don't even need special tools to do it! +In fact, there are only four things needed to use these config +facilities: + +- a running Linux system +- root access +- common sense (this is especially hard to come by these days) +- knowledge of what all those values mean + +As a quick 'ls /proc/sys' will show, the directory consists of +several (arch-dependent?) subdirs. Each subdir is mainly about +one part of the kernel, so you can do configuration on a piece +by piece basis, or just some 'thematic frobbing'. + +This documentation is about: + +=============== =============================================================== +abi/ execution domains & personalities +debug/ +dev/ device specific information (eg dev/cdrom/info) +fs/ specific filesystems + filehandle, inode, dentry and quota tuning + binfmt_misc +kernel/ global kernel info / tuning + miscellaneous stuff +net/ networking stuff, for documentation look in: + +proc/ +sunrpc/ SUN Remote Procedure Call (NFS) +vm/ memory management tuning + buffer and cache management +user/ Per user per user namespace limits +=============== =============================================================== + +These are the subdirs I have on my system. There might be more +or other subdirs in another setup. If you see another dir, I'd +really like to hear about it :-) + +.. toctree:: + :maxdepth: 1 + + abi + fs + kernel + net + sunrpc + user + vm diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst new file mode 100644 index 000000000000..a0c1d4ce403a --- /dev/null +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -0,0 +1,1177 @@ +=================================== +Documentation for /proc/sys/kernel/ +=================================== + +kernel version 2.2.10 + +Copyright (c) 1998, 1999, Rik van Riel + +Copyright (c) 2009, Shen Feng + +For general info and legal blurb, please look in index.rst. + +------------------------------------------------------------------------------ + +This file contains documentation for the sysctl files in +/proc/sys/kernel/ and is valid for Linux kernel version 2.2. + +The files in this directory can be used to tune and monitor +miscellaneous and general things in the operation of the Linux +kernel. Since some of the files _can_ be used to screw up your +system, it is advisable to read both documentation and source +before actually making adjustments. + +Currently, these files might (depending on your configuration) +show up in /proc/sys/kernel: + +- acct +- acpi_video_flags +- auto_msgmni +- bootloader_type [ X86 only ] +- bootloader_version [ X86 only ] +- cap_last_cap +- core_pattern +- core_pipe_limit +- core_uses_pid +- ctrl-alt-del +- dmesg_restrict +- domainname +- hostname +- hotplug +- hardlockup_all_cpu_backtrace +- hardlockup_panic +- hung_task_panic +- hung_task_check_count +- hung_task_timeout_secs +- hung_task_check_interval_secs +- hung_task_warnings +- hyperv_record_panic_msg +- kexec_load_disabled +- kptr_restrict +- l2cr [ PPC only ] +- modprobe ==> Documentation/debugging-modules.txt +- modules_disabled +- msg_next_id [ sysv ipc ] +- msgmax +- msgmnb +- msgmni +- nmi_watchdog +- osrelease +- ostype +- overflowgid +- overflowuid +- panic +- panic_on_oops +- panic_on_stackoverflow +- panic_on_unrecovered_nmi +- panic_on_warn +- panic_print +- panic_on_rcu_stall +- perf_cpu_time_max_percent +- perf_event_paranoid +- perf_event_max_stack +- perf_event_mlock_kb +- perf_event_max_contexts_per_stack +- pid_max +- powersave-nap [ PPC only ] +- printk +- printk_delay +- printk_ratelimit +- printk_ratelimit_burst +- pty ==> Documentation/filesystems/devpts.txt +- randomize_va_space +- real-root-dev ==> Documentation/admin-guide/initrd.rst +- reboot-cmd [ SPARC only ] +- rtsig-max +- rtsig-nr +- sched_energy_aware +- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst +- sem +- sem_next_id [ sysv ipc ] +- sg-big-buff [ generic SCSI device (sg) ] +- shm_next_id [ sysv ipc ] +- shm_rmid_forced +- shmall +- shmmax [ sysv ipc ] +- shmmni +- softlockup_all_cpu_backtrace +- soft_watchdog +- stack_erasing +- stop-a [ SPARC only ] +- sysrq ==> Documentation/admin-guide/sysrq.rst +- sysctl_writes_strict +- tainted ==> Documentation/admin-guide/tainted-kernels.rst +- threads-max +- unknown_nmi_panic +- watchdog +- watchdog_thresh +- version + + +acct: +===== + +highwater lowwater frequency + +If BSD-style process accounting is enabled these values control +its behaviour. If free space on filesystem where the log lives +goes below % accounting suspends. If free space gets +above % accounting resumes. determines +how often do we check the amount of free space (value is in +seconds). Default: +4 2 30 +That is, suspend accounting if there left <= 2% free; resume it +if we got >=4%; consider information about amount of free space +valid for 30 seconds. + + +acpi_video_flags: +================= + +flags + +See Doc*/kernel/power/video.txt, it allows mode of video boot to be +set during run time. + + +auto_msgmni: +============ + +This variable has no effect and may be removed in future kernel +releases. Reading it always returns 0. +Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni +upon memory add/remove or upon ipc namespace creation/removal. +Echoing "1" into this file enabled msgmni automatic recomputing. +Echoing "0" turned it off. auto_msgmni default value was 1. + + +bootloader_type: +================ + +x86 bootloader identification + +This gives the bootloader type number as indicated by the bootloader, +shifted left by 4, and OR'd with the low four bits of the bootloader +version. The reason for this encoding is that this used to match the +type_of_loader field in the kernel header; the encoding is kept for +backwards compatibility. That is, if the full bootloader type number +is 0x15 and the full version number is 0x234, this file will contain +the value 340 = 0x154. + +See the type_of_loader and ext_loader_type fields in +Documentation/x86/boot.rst for additional information. + + +bootloader_version: +=================== + +x86 bootloader version + +The complete bootloader version number. In the example above, this +file will contain the value 564 = 0x234. + +See the type_of_loader and ext_loader_ver fields in +Documentation/x86/boot.rst for additional information. + + +cap_last_cap: +============= + +Highest valid capability of the running kernel. Exports +CAP_LAST_CAP from the kernel. + + +core_pattern: +============= + +core_pattern is used to specify a core dumpfile pattern name. + +* max length 127 characters; default value is "core" +* core_pattern is used as a pattern template for the output filename; + certain string patterns (beginning with '%') are substituted with + their actual values. +* backward compatibility with core_uses_pid: + + If core_pattern does not include "%p" (default does not) + and core_uses_pid is set, then .PID will be appended to + the filename. + +* corename format specifiers:: + + % '%' is dropped + %% output one '%' + %p pid + %P global pid (init PID namespace) + %i tid + %I global tid (init PID namespace) + %u uid (in initial user namespace) + %g gid (in initial user namespace) + %d dump mode, matches PR_SET_DUMPABLE and + /proc/sys/fs/suid_dumpable + %s signal number + %t UNIX time of dump + %h hostname + %e executable filename (may be shortened) + %E executable path + % both are dropped + +* If the first character of the pattern is a '|', the kernel will treat + the rest of the pattern as a command to run. The core dump will be + written to the standard input of that program instead of to a file. + + +core_pipe_limit: +================ + +This sysctl is only applicable when core_pattern is configured to pipe +core files to a user space helper (when the first character of +core_pattern is a '|', see above). When collecting cores via a pipe +to an application, it is occasionally useful for the collecting +application to gather data about the crashing process from its +/proc/pid directory. In order to do this safely, the kernel must wait +for the collecting process to exit, so as not to remove the crashing +processes proc files prematurely. This in turn creates the +possibility that a misbehaving userspace collecting process can block +the reaping of a crashed process simply by never exiting. This sysctl +defends against that. It defines how many concurrent crashing +processes may be piped to user space applications in parallel. If +this value is exceeded, then those crashing processes above that value +are noted via the kernel log and their cores are skipped. 0 is a +special value, indicating that unlimited processes may be captured in +parallel, but that no waiting will take place (i.e. the collecting +process is not guaranteed access to /proc//). This +value defaults to 0. + + +core_uses_pid: +============== + +The default coredump filename is "core". By setting +core_uses_pid to 1, the coredump filename becomes core.PID. +If core_pattern does not include "%p" (default does not) +and core_uses_pid is set, then .PID will be appended to +the filename. + + +ctrl-alt-del: +============= + +When the value in this file is 0, ctrl-alt-del is trapped and +sent to the init(1) program to handle a graceful restart. +When, however, the value is > 0, Linux's reaction to a Vulcan +Nerve Pinch (tm) will be an immediate reboot, without even +syncing its dirty buffers. + +Note: + when a program (like dosemu) has the keyboard in 'raw' + mode, the ctrl-alt-del is intercepted by the program before it + ever reaches the kernel tty layer, and it's up to the program + to decide what to do with it. + + +dmesg_restrict: +=============== + +This toggle indicates whether unprivileged users are prevented +from using dmesg(8) to view messages from the kernel's log buffer. +When dmesg_restrict is set to (0) there are no restrictions. When +dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use +dmesg(8). + +The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the +default value of dmesg_restrict. + + +domainname & hostname: +====================== + +These files can be used to set the NIS/YP domainname and the +hostname of your box in exactly the same way as the commands +domainname and hostname, i.e.:: + + # echo "darkstar" > /proc/sys/kernel/hostname + # echo "mydomain" > /proc/sys/kernel/domainname + +has the same effect as:: + + # hostname "darkstar" + # domainname "mydomain" + +Note, however, that the classic darkstar.frop.org has the +hostname "darkstar" and DNS (Internet Domain Name Server) +domainname "frop.org", not to be confused with the NIS (Network +Information Service) or YP (Yellow Pages) domainname. These two +domain names are in general different. For a detailed discussion +see the hostname(1) man page. + + +hardlockup_all_cpu_backtrace: +============================= + +This value controls the hard lockup detector behavior when a hard +lockup condition is detected as to whether or not to gather further +debug information. If enabled, arch-specific all-CPU stack dumping +will be initiated. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + + +hardlockup_panic: +================= + +This parameter can be used to control whether the kernel panics +when a hard lockup is detected. + + 0 - don't panic on hard lockup + 1 - panic on hard lockup + +See Documentation/lockup-watchdogs.txt for more information. This can +also be set using the nmi_watchdog kernel parameter. + + +hotplug: +======== + +Path for the hotplug policy agent. +Default value is "/sbin/hotplug". + + +hung_task_panic: +================ + +Controls the kernel's behavior when a hung task is detected. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0: continue operation. This is the default behavior. + +1: panic immediately. + + +hung_task_check_count: +====================== + +The upper bound on the number of tasks that are checked. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + + +hung_task_timeout_secs: +======================= + +When a task in D state did not get scheduled +for more than this value report a warning. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0: means infinite timeout - no checking done. + +Possible values to set are in range {0..LONG_MAX/HZ}. + + +hung_task_check_interval_secs: +============================== + +Hung task check interval. If hung task checking is enabled +(see hung_task_timeout_secs), the check is done every +hung_task_check_interval_secs seconds. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0 (default): means use hung_task_timeout_secs as checking interval. +Possible values to set are in range {0..LONG_MAX/HZ}. + + +hung_task_warnings: +=================== + +The maximum number of warnings to report. During a check interval +if a hung task is detected, this value is decreased by 1. +When this value reaches 0, no more warnings will be reported. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +-1: report an infinite number of warnings. + + +hyperv_record_panic_msg: +======================== + +Controls whether the panic kmsg data should be reported to Hyper-V. + +0: do not report panic kmsg data. + +1: report the panic kmsg data. This is the default behavior. + + +kexec_load_disabled: +==================== + +A toggle indicating if the kexec_load syscall has been disabled. This +value defaults to 0 (false: kexec_load enabled), but can be set to 1 +(true: kexec_load disabled). Once true, kexec can no longer be used, and +the toggle cannot be set back to false. This allows a kexec image to be +loaded before disabling the syscall, allowing a system to set up (and +later use) an image without it being altered. Generally used together +with the "modules_disabled" sysctl. + + +kptr_restrict: +============== + +This toggle indicates whether restrictions are placed on +exposing kernel addresses via /proc and other interfaces. + +When kptr_restrict is set to 0 (the default) the address is hashed before +printing. (This is the equivalent to %p.) + +When kptr_restrict is set to (1), kernel pointers printed using the %pK +format specifier will be replaced with 0's unless the user has CAP_SYSLOG +and effective user and group ids are equal to the real ids. This is +because %pK checks are done at read() time rather than open() time, so +if permissions are elevated between the open() and the read() (e.g via +a setuid binary) then %pK will not leak kernel pointers to unprivileged +users. Note, this is a temporary solution only. The correct long-term +solution is to do the permission checks at open() time. Consider removing +world read permissions from files that use %pK, and using dmesg_restrict +to protect against uses of %pK in dmesg(8) if leaking kernel pointer +values to unprivileged users is a concern. + +When kptr_restrict is set to (2), kernel pointers printed using +%pK will be replaced with 0's regardless of privileges. + + +l2cr: (PPC only) +================ + +This flag controls the L2 cache of G3 processor boards. If +0, the cache is disabled. Enabled if nonzero. + + +modules_disabled: +================= + +A toggle value indicating if modules are allowed to be loaded +in an otherwise modular kernel. This toggle defaults to off +(0), but can be set true (1). Once true, modules can be +neither loaded nor unloaded, and the toggle cannot be set back +to false. Generally used with the "kexec_load_disabled" toggle. + + +msg_next_id, sem_next_id, and shm_next_id: +========================================== + +These three toggles allows to specify desired id for next allocated IPC +object: message, semaphore or shared memory respectively. + +By default they are equal to -1, which means generic allocation logic. +Possible values to set are in range {0..INT_MAX}. + +Notes: + 1) kernel doesn't guarantee, that new object will have desired id. So, + it's up to userspace, how to handle an object with "wrong" id. + 2) Toggle with non-default value will be set back to -1 by kernel after + successful IPC object allocation. If an IPC object allocation syscall + fails, it is undefined if the value remains unmodified or is reset to -1. + + +nmi_watchdog: +============= + +This parameter can be used to control the NMI watchdog +(i.e. the hard lockup detector) on x86 systems. + +0 - disable the hard lockup detector + +1 - enable the hard lockup detector + +The hard lockup detector monitors each CPU for its ability to respond to +timer interrupts. The mechanism utilizes CPU performance counter registers +that are programmed to generate Non-Maskable Interrupts (NMIs) periodically +while a CPU is busy. Hence, the alternative name 'NMI watchdog'. + +The NMI watchdog is disabled by default if the kernel is running as a guest +in a KVM virtual machine. This default can be overridden by adding:: + + nmi_watchdog=1 + +to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst). + + +numa_balancing: +=============== + +Enables/disables automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes +that access it often. + +Enables/disables automatic NUMA memory balancing. On NUMA machines, there +is a performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing memory +by periodically unmapping pages and later trapping a page fault. At the +time of the page fault, it is determined if the data being accessed should +be migrated to a local memory node. + +The unmapping of pages and trapping faults incur additional overhead that +ideally is offset by improved memory locality but there is no universal +guarantee. If the target workload is already bound to NUMA nodes then this +feature should be disabled. Otherwise, if the system overhead from the +feature is too high then the rate the kernel samples for NUMA hinting +faults may be controlled by the numa_balancing_scan_period_min_ms, +numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. + +numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb +=============================================================================================================================== + + +Automatic NUMA balancing scans tasks address space and unmaps pages to +detect if pages are properly placed or if the data should be migrated to a +memory node local to where the task is running. Every "scan delay" the task +scans the next "scan size" number of pages in its address space. When the +end of the address space is reached the scanner restarts from the beginning. + +In combination, the "scan delay" and "scan size" determine the scan rate. +When "scan delay" decreases, the scan rate increases. The scan delay and +hence the scan rate of every task is adaptive and depends on historical +behaviour. If pages are properly placed then the scan delay increases, +otherwise the scan delay decreases. The "scan size" is not adaptive but +the higher the "scan size", the higher the scan rate. + +Higher scan rates incur higher system overhead as page faults must be +trapped and potentially data must be migrated. However, the higher the scan +rate, the more quickly a tasks memory is migrated to a local node if the +workload pattern changes and minimises performance impact due to remote +memory accesses. These sysctls control the thresholds for scan delays and +the number of pages scanned. + +numa_balancing_scan_period_min_ms is the minimum time in milliseconds to +scan a tasks virtual memory. It effectively controls the maximum scanning +rate for each task. + +numa_balancing_scan_delay_ms is the starting "scan delay" used for a task +when it initially forks. + +numa_balancing_scan_period_max_ms is the maximum time in milliseconds to +scan a tasks virtual memory. It effectively controls the minimum scanning +rate for each task. + +numa_balancing_scan_size_mb is how many megabytes worth of pages are +scanned for a given scan. + + +osrelease, ostype & version: +============================ + +:: + + # cat osrelease + 2.1.88 + # cat ostype + Linux + # cat version + #5 Wed Feb 25 21:49:24 MET 1998 + +The files osrelease and ostype should be clear enough. Version +needs a little more clarification however. The '#5' means that +this is the fifth kernel built from this source base and the +date behind it indicates the time the kernel was built. +The only way to tune these values is to rebuild the kernel :-) + + +overflowgid & overflowuid: +========================== + +if your architecture did not always support 32-bit UIDs (i.e. arm, +i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to +applications that use the old 16-bit UID/GID system calls, if the +actual UID or GID would exceed 65535. + +These sysctls allow you to change the value of the fixed UID and GID. +The default is 65534. + + +panic: +====== + +The value in this file represents the number of seconds the kernel +waits before rebooting on a panic. When you use the software watchdog, +the recommended setting is 60. + + +panic_on_io_nmi: +================ + +Controls the kernel's behavior when a CPU receives an NMI caused by +an IO error. + +0: try to continue operation (default) + +1: panic immediately. The IO error triggered an NMI. This indicates a + serious system condition which could result in IO data corruption. + Rather than continuing, panicking might be a better choice. Some + servers issue this sort of NMI when the dump button is pushed, + and you can use this option to take a crash dump. + + +panic_on_oops: +============== + +Controls the kernel's behaviour when an oops or BUG is encountered. + +0: try to continue operation + +1: panic immediately. If the `panic` sysctl is also non-zero then the + machine will be rebooted. + + +panic_on_stackoverflow: +======================= + +Controls the kernel's behavior when detecting the overflows of +kernel, IRQ and exception stacks except a user stack. +This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. + +0: try to continue operation. + +1: panic immediately. + + +panic_on_unrecovered_nmi: +========================= + +The default Linux behaviour on an NMI of either memory or unknown is +to continue operation. For many environments such as scientific +computing it is preferable that the box is taken out and the error +dealt with than an uncorrected parity/ECC error get propagated. + +A small number of systems do generate NMI's for bizarre random reasons +such as power management so the default is off. That sysctl works like +the existing panic controls already in that directory. + + +panic_on_warn: +============== + +Calls panic() in the WARN() path when set to 1. This is useful to avoid +a kernel rebuild when attempting to kdump at the location of a WARN(). + +0: only WARN(), default behaviour. + +1: call panic() after printing out WARN() location. + + +panic_print: +============ + +Bitmask for printing system info when panic happens. User can chose +combination of the following bits: + +===== ======================================== +bit 0 print all tasks info +bit 1 print system memory info +bit 2 print timer info +bit 3 print locks info if CONFIG_LOCKDEP is on +bit 4 print ftrace buffer +===== ======================================== + +So for example to print tasks and memory info on panic, user can:: + + echo 3 > /proc/sys/kernel/panic_print + + +panic_on_rcu_stall: +=================== + +When set to 1, calls panic() after RCU stall detection messages. This +is useful to define the root cause of RCU stalls using a vmcore. + +0: do not panic() when RCU stall takes place, default behavior. + +1: panic() after printing RCU stall messages. + + +perf_cpu_time_max_percent: +========================== + +Hints to the kernel how much CPU time it should be allowed to +use to handle perf sampling events. If the perf subsystem +is informed that its samples are exceeding this limit, it +will drop its sampling frequency to attempt to reduce its CPU +usage. + +Some perf sampling happens in NMIs. If these samples +unexpectedly take too long to execute, the NMIs can become +stacked up next to each other so much that nothing else is +allowed to execute. + +0: + disable the mechanism. Do not monitor or correct perf's + sampling rate no matter how CPU time it takes. + +1-100: + attempt to throttle perf's sample rate to this + percentage of CPU. Note: the kernel calculates an + "expected" length of each sample event. 100 here means + 100% of that expected length. Even if this is set to + 100, you may still see sample throttling if this + length is exceeded. Set to 0 if you truly do not care + how much CPU is consumed. + + +perf_event_paranoid: +==================== + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 2. + +=== ================================================================== + -1 Allow use of (almost) all events by all users + + Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK + +>=0 Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN + + Disallow raw tracepoint access by users without CAP_SYS_ADMIN + +>=1 Disallow CPU event access by users without CAP_SYS_ADMIN + +>=2 Disallow kernel profiling by users without CAP_SYS_ADMIN +=== ================================================================== + + +perf_event_max_stack: +===================== + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + + +perf_event_mlock_kb: +==================== + +Control size of per-cpu ring buffer not counted agains mlock limit. + +The default value is 512 + 1 page + + +perf_event_max_contexts_per_stack: +================================== + +Controls maximum number of stack frame context entries for +(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for +instance, when using 'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 8. + + +pid_max: +======== + +PID allocation wrap value. When the kernel's next PID value +reaches this value, it wraps back to a minimum PID value. +PIDs of value pid_max or larger are not allocated. + + +ns_last_pid: +============ + +The last pid allocated in the current (the one task using this sysctl +lives in) pid namespace. When selecting a pid for a next task on fork +kernel tries to allocate a number starting from this one. + + +powersave-nap: (PPC only) +========================= + +If set, Linux-PPC will use the 'nap' mode of powersaving, +otherwise the 'doze' mode will be used. + +============================================================== + +printk: +======= + +The four values in printk denote: console_loglevel, +default_message_loglevel, minimum_console_loglevel and +default_console_loglevel respectively. + +These values influence printk() behavior when printing or +logging error messages. See 'man 2 syslog' for more info on +the different loglevels. + +- console_loglevel: + messages with a higher priority than + this will be printed to the console +- default_message_loglevel: + messages without an explicit priority + will be printed with this priority +- minimum_console_loglevel: + minimum (highest) value to which + console_loglevel can be set +- default_console_loglevel: + default value for console_loglevel + + +printk_delay: +============= + +Delay each printk message in printk_delay milliseconds + +Value from 0 - 10000 is allowed. + + +printk_ratelimit: +================= + +Some warning messages are rate limited. printk_ratelimit specifies +the minimum length of time between these messages (in jiffies), by +default we allow one every 5 seconds. + +A value of 0 will disable rate limiting. + + +printk_ratelimit_burst: +======================= + +While long term we enforce one message per printk_ratelimit +seconds, we do allow a burst of messages to pass through. +printk_ratelimit_burst specifies the number of messages we can +send before ratelimiting kicks in. + + +printk_devkmsg: +=============== + +Control the logging to /dev/kmsg from userspace: + +ratelimit: + default, ratelimited + +on: unlimited logging to /dev/kmsg from userspace + +off: logging to /dev/kmsg disabled + +The kernel command line parameter printk.devkmsg= overrides this and is +a one-time setting until next reboot: once set, it cannot be changed by +this sysctl interface anymore. + + +randomize_va_space: +=================== + +This option can be used to select the type of process address +space randomization that is used in the system, for architectures +that support this feature. + +== =========================================================================== +0 Turn the process address space randomization off. This is the + default for architectures that do not support this feature anyways, + and kernels that are booted with the "norandmaps" parameter. + +1 Make the addresses of mmap base, stack and VDSO page randomized. + This, among other things, implies that shared libraries will be + loaded to random addresses. Also for PIE-linked binaries, the + location of code start is randomized. This is the default if the + CONFIG_COMPAT_BRK option is enabled. + +2 Additionally enable heap randomization. This is the default if + CONFIG_COMPAT_BRK is disabled. + + There are a few legacy applications out there (such as some ancient + versions of libc.so.5 from 1996) that assume that brk area starts + just after the end of the code+bss. These applications break when + start of the brk area is randomized. There are however no known + non-legacy applications that would be broken this way, so for most + systems it is safe to choose full randomization. + + Systems with ancient and/or broken binaries should be configured + with CONFIG_COMPAT_BRK enabled, which excludes the heap from process + address space randomization. +== =========================================================================== + + +reboot-cmd: (Sparc only) +======================== + +??? This seems to be a way to give an argument to the Sparc +ROM/Flash boot loader. Maybe to tell it what to do after +rebooting. ??? + + +rtsig-max & rtsig-nr: +===================== + +The file rtsig-max can be used to tune the maximum number +of POSIX realtime (queued) signals that can be outstanding +in the system. + +rtsig-nr shows the number of RT signals currently queued. + + +sched_energy_aware: +=================== + +Enables/disables Energy Aware Scheduling (EAS). EAS starts +automatically on platforms where it can run (that is, +platforms with asymmetric CPU topologies and having an Energy +Model available). If your platform happens to meet the +requirements for EAS but you do not want to use it, change +this value to 0. + + +sched_schedstats: +================= + +Enables/disables scheduler statistics. Enabling this feature +incurs a small amount of overhead in the scheduler but is +useful for debugging and performance tuning. + + +sg-big-buff: +============ + +This file shows the size of the generic SCSI (sg) buffer. +You can't tune it just yet, but you could change it on +compile time by editing include/scsi/sg.h and changing +the value of SG_BIG_BUFF. + +There shouldn't be any reason to change this value. If +you can come up with one, you probably know what you +are doing anyway :) + + +shmall: +======= + +This parameter sets the total amount of shared memory pages that +can be used system wide. Hence, SHMALL should always be at least +ceil(shmmax/PAGE_SIZE). + +If you are not sure what the default PAGE_SIZE is on your Linux +system, you can run the following command: + + # getconf PAGE_SIZE + + +shmmax: +======= + +This value can be used to query and set the run time limit +on the maximum shared memory segment size that can be created. +Shared memory segments up to 1Gb are now supported in the +kernel. This value defaults to SHMMAX. + + +shm_rmid_forced: +================ + +Linux lets you set resource limits, including how much memory one +process can consume, via setrlimit(2). Unfortunately, shared memory +segments are allowed to exist without association with any process, and +thus might not be counted against any resource limits. If enabled, +shared memory segments are automatically destroyed when their attach +count becomes zero after a detach or a process termination. It will +also destroy segments that were created, but never attached to, on exit +from the process. The only use left for IPC_RMID is to immediately +destroy an unattached segment. Of course, this breaks the way things are +defined, so some applications might stop working. Note that this +feature will do you no good unless you also configure your resource +limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't +need this. + +Note that if you change this from 0 to 1, already created segments +without users and with a dead originative process will be destroyed. + + +sysctl_writes_strict: +===================== + +Control how file position affects the behavior of updating sysctl values +via the /proc/sys interface: + + == ====================================================================== + -1 Legacy per-write sysctl value handling, with no printk warnings. + Each write syscall must fully contain the sysctl value to be + written, and multiple writes on the same sysctl file descriptor + will rewrite the sysctl value, regardless of file position. + 0 Same behavior as above, but warn about processes that perform writes + to a sysctl file descriptor when the file position is not 0. + 1 (default) Respect file position when writing sysctl strings. Multiple + writes will append to the sysctl value buffer. Anything past the max + length of the sysctl value buffer will be ignored. Writes to numeric + sysctl entries must always be at file position 0 and the value must + be fully contained in the buffer sent in the write syscall. + == ====================================================================== + + +softlockup_all_cpu_backtrace: +============================= + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + + +soft_watchdog: +============== + +This parameter can be used to control the soft lockup detector. + + 0 - disable the soft lockup detector + + 1 - enable the soft lockup detector + +The soft lockup detector monitors CPUs for threads that are hogging the CPUs +without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads +from running. The mechanism depends on the CPUs ability to respond to timer +interrupts which are needed for the 'watchdog/N' threads to be woken up by +the watchdog timer function, otherwise the NMI watchdog - if enabled - can +detect a hard lockup condition. + + +stack_erasing: +============== + +This parameter can be used to control kernel stack erasing at the end +of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. + +That erasing reduces the information which kernel stack leak bugs +can reveal and blocks some uninitialized stack variable attacks. +The tradeoff is the performance impact: on a single CPU system kernel +compilation sees a 1% slowdown, other systems and workloads may vary. + + 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. + + 1: kernel stack erasing is enabled (default), it is performed before + returning to the userspace at the end of syscalls. + + +tainted +======= + +Non-zero if the kernel has been tainted. Numeric values, which can be +ORed together. The letters are seen in "Tainted" line of Oops reports. + +====== ===== ============================================================== + 1 `(P)` proprietary module was loaded + 2 `(F)` module was force loaded + 4 `(S)` SMP kernel oops on an officially SMP incapable processor + 8 `(R)` module was force unloaded + 16 `(M)` processor reported a Machine Check Exception (MCE) + 32 `(B)` bad page referenced or some unexpected page flags + 64 `(U)` taint requested by userspace application + 128 `(D)` kernel died recently, i.e. there was an OOPS or BUG + 256 `(A)` an ACPI table was overridden by user + 512 `(W)` kernel issued warning + 1024 `(C)` staging driver was loaded + 2048 `(I)` workaround for bug in platform firmware applied + 4096 `(O)` externally-built ("out-of-tree") module was loaded + 8192 `(E)` unsigned module was loaded + 16384 `(L)` soft lockup occurred + 32768 `(K)` kernel has been live patched + 65536 `(X)` Auxiliary taint, defined and used by for distros +131072 `(T)` The kernel was built with the struct randomization plugin +====== ===== ============================================================== + +See Documentation/admin-guide/tainted-kernels.rst for more information. + + +threads-max: +============ + +This value controls the maximum number of threads that can be created +using fork(). + +During initialization the kernel sets this value such that even if the +maximum number of threads is created, the thread structures occupy only +a part (1/8th) of the available RAM pages. + +The minimum value that can be written to threads-max is 20. + +The maximum value that can be written to threads-max is given by the +constant FUTEX_TID_MASK (0x3fffffff). + +If a value outside of this range is written to threads-max an error +EINVAL occurs. + +The value written is checked against the available RAM pages. If the +thread structures would occupy too much (more than 1/8th) of the +available RAM pages threads-max is reduced accordingly. + + +unknown_nmi_panic: +================== + +The value in this file affects behavior of handling NMI. When the +value is non-zero, unknown NMI is trapped and then panic occurs. At +that time, kernel debugging information is displayed on console. + +NMI switch that most IA32 servers have fires unknown NMI up, for +example. If a system hangs up, try pressing the NMI switch. + + +watchdog: +========= + +This parameter can be used to disable or enable the soft lockup detector +_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time. + + 0 - disable both lockup detectors + + 1 - enable both lockup detectors + +The soft lockup detector and the NMI watchdog can also be disabled or +enabled individually, using the soft_watchdog and nmi_watchdog parameters. +If the watchdog parameter is read, for example by executing:: + + cat /proc/sys/kernel/watchdog + +the output of this command (0 or 1) shows the logical OR of soft_watchdog +and nmi_watchdog. + + +watchdog_cpumask: +================= + +This value can be used to control on which cpus the watchdog may run. +The default cpumask is all possible cores, but if NO_HZ_FULL is +enabled in the kernel config, and cores are specified with the +nohz_full= boot argument, those cores are excluded by default. +Offline cores can be included in this mask, and if the core is later +brought online, the watchdog will be started based on the mask value. + +Typically this value would only be touched in the nohz_full case +to re-enable cores that by default were not running the watchdog, +if a kernel lockup was suspected on those cores. + +The argument value is the standard cpulist format for cpumasks, +so for example to enable the watchdog on cores 0, 2, 3, and 4 you +might say:: + + echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask + + +watchdog_thresh: +================ + +This value can be used to control the frequency of hrtimer and NMI +events and the soft and hard lockup thresholds. The default threshold +is 10 seconds. + +The softlockup threshold is (2 * watchdog_thresh). Setting this +tunable to zero will disable lockup detection altogether. diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst new file mode 100644 index 000000000000..a7d44e71019d --- /dev/null +++ b/Documentation/admin-guide/sysctl/net.rst @@ -0,0 +1,461 @@ +================================ +Documentation for /proc/sys/net/ +================================ + +Copyright + +Copyright (c) 1999 + + - Terrehon Bowden + - Bodo Bauer + +Copyright (c) 2000 + + - Jorge Nerin + +Copyright (c) 2009 + + - Shen Feng + +For general info and legal blurb, please look in index.rst. + +------------------------------------------------------------------------------ + +This file contains the documentation for the sysctl files in +/proc/sys/net + +The interface to the networking parts of the kernel is located in +/proc/sys/net. The following table shows all possible subdirectories. You may +see only some of them, depending on your kernel's configuration. + + +Table : Subdirectories in /proc/sys/net + + ========= =================== = ========== ================== + Directory Content Directory Content + ========= =================== = ========== ================== + core General parameter appletalk Appletalk protocol + unix Unix domain sockets netrom NET/ROM + 802 E802 protocol ax25 AX25 + ethernet Ethernet protocol rose X.25 PLP layer + ipv4 IP version 4 x25 X.25 protocol + ipx IPX token-ring IBM token ring + bridge Bridging decnet DEC net + ipv6 IP version 6 tipc TIPC + ========= =================== = ========== ================== + +1. /proc/sys/net/core - Network core options +============================================ + +bpf_jit_enable +-------------- + +This enables the BPF Just in Time (JIT) compiler. BPF is a flexible +and efficient infrastructure allowing to execute bytecode at various +hook points. It is used in a number of Linux kernel subsystems such +as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints) +and security (e.g. seccomp). LLVM has a BPF back end that can compile +restricted C into a sequence of BPF instructions. After program load +through bpf(2) and passing a verifier in the kernel, a JIT will then +translate these BPF proglets into native CPU instructions. There are +two flavors of JITs, the newer eBPF JIT currently supported on: + + - x86_64 + - x86_32 + - arm64 + - arm32 + - ppc64 + - sparc64 + - mips64 + - s390x + - riscv + +And the older cBPF JIT supported on the following archs: + + - mips + - ppc + - sparc + +eBPF JITs are a superset of cBPF JITs, meaning the kernel will +migrate cBPF instructions into eBPF instructions and then JIT +compile them transparently. Older cBPF JITs can only translate +tcpdump filters, seccomp rules, etc, but not mentioned eBPF +programs loaded through bpf(2). + +Values: + + - 0 - disable the JIT (default value) + - 1 - enable the JIT + - 2 - enable the JIT and ask the compiler to emit traces on kernel log. + +bpf_jit_harden +-------------- + +This enables hardening for the BPF JIT compiler. Supported are eBPF +JIT backends. Enabling hardening trades off performance, but can +mitigate JIT spraying. + +Values: + + - 0 - disable JIT hardening (default value) + - 1 - enable JIT hardening for unprivileged users only + - 2 - enable JIT hardening for all users + +bpf_jit_kallsyms +---------------- + +When BPF JIT compiler is enabled, then compiled images are unknown +addresses to the kernel, meaning they neither show up in traces nor +in /proc/kallsyms. This enables export of these addresses, which can +be used for debugging/tracing. If bpf_jit_harden is enabled, this +feature is disabled. + +Values : + + - 0 - disable JIT kallsyms export (default value) + - 1 - enable JIT kallsyms export for privileged users only + +bpf_jit_limit +------------- + +This enforces a global limit for memory allocations to the BPF JIT +compiler in order to reject unprivileged JIT requests once it has +been surpassed. bpf_jit_limit contains the value of the global limit +in bytes. + +dev_weight +---------- + +The maximum number of packets that kernel can handle on a NAPI interrupt, +it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware +aggregated packet is counted as one packet in this context. + +Default: 64 + +dev_weight_rx_bias +------------------ + +RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function +of the driver for the per softirq cycle netdev_budget. This parameter influences +the proportion of the configured netdev_budget that is spent on RPS based packet +processing during RX softirq cycles. It is further meant for making current +dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack. +(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based +on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias). + +Default: 1 + +dev_weight_tx_bias +------------------ + +Scales the maximum number of packets that can be processed during a TX softirq cycle. +Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric +net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog. + +Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias). + +Default: 1 + +default_qdisc +------------- + +The default queuing discipline to use for network devices. This allows +overriding the default of pfifo_fast with an alternative. Since the default +queuing discipline is created without additional parameters so is best suited +to queuing disciplines that work well without configuration like stochastic +fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use +queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin +which require setting up classes and bandwidths. Note that physical multiqueue +interfaces still use mq as root qdisc, which in turn uses this default for its +leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead +default to noqueue. + +Default: pfifo_fast + +busy_read +--------- + +Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL) +Approximate time in us to busy loop waiting for packets on the device queue. +This sets the default value of the SO_BUSY_POLL socket option. +Can be set or overridden per socket by setting socket option SO_BUSY_POLL, +which is the preferred method of enabling. If you need to enable the feature +globally via sysctl, a value of 50 is recommended. + +Will increase power usage. + +Default: 0 (off) + +busy_poll +---------------- +Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL) +Approximate time in us to busy loop waiting for events. +Recommended value depends on the number of sockets you poll on. +For several sockets 50, for several hundreds 100. +For more than that you probably want to use epoll. +Note that only sockets with SO_BUSY_POLL set will be busy polled, +so you want to either selectively set SO_BUSY_POLL on those sockets or set +sysctl.net.busy_read globally. + +Will increase power usage. + +Default: 0 (off) + +rmem_default +------------ + +The default setting of the socket receive buffer in bytes. + +rmem_max +-------- + +The maximum receive socket buffer size in bytes. + +tstamp_allow_data +----------------- +Allow processes to receive tx timestamps looped together with the original +packet contents. If disabled, transmit timestamp requests from unprivileged +processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set. + +Default: 1 (on) + + +wmem_default +------------ + +The default setting (in bytes) of the socket send buffer. + +wmem_max +-------- + +The maximum send socket buffer size in bytes. + +message_burst and message_cost +------------------------------ + +These parameters are used to limit the warning messages written to the kernel +log from the networking code. They enforce a rate limit to make a +denial-of-service attack impossible. A higher message_cost factor, results in +fewer messages that will be written. Message_burst controls when messages will +be dropped. The default settings limit warning messages to one every five +seconds. + +warnings +-------- + +This sysctl is now unused. + +This was used to control console messages from the networking stack that +occur because of problems on the network like duplicate address or bad +checksums. + +These messages are now emitted at KERN_DEBUG and can generally be enabled +and controlled by the dynamic_debug facility. + +netdev_budget +------------- + +Maximum number of packets taken from all interfaces in one polling cycle (NAPI +poll). In one polling cycle interfaces which are registered to polling are +probed in a round-robin manner. Also, a polling cycle may not exceed +netdev_budget_usecs microseconds, even if netdev_budget has not been +exhausted. + +netdev_budget_usecs +--------------------- + +Maximum number of microseconds in one NAPI polling cycle. Polling +will exit when either netdev_budget_usecs have elapsed during the +poll cycle or the number of packets processed reaches netdev_budget. + +netdev_max_backlog +------------------ + +Maximum number of packets, queued on the INPUT side, when the interface +receives packets faster than kernel can process them. + +netdev_rss_key +-------------- + +RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is +randomly generated. +Some user space might need to gather its content even if drivers do not +provide ethtool -x support yet. + +:: + + myhost:~# cat /proc/sys/net/core/netdev_rss_key + 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total) + +File contains nul bytes if no driver ever called netdev_rss_key_fill() function. + +Note: + /proc/sys/net/core/netdev_rss_key contains 52 bytes of key, + but most drivers only use 40 bytes of it. + +:: + + myhost:~# ethtool -x eth0 + RX flow hash indirection table for eth0 with 8 RX ring(s): + 0: 0 1 2 3 4 5 6 7 + RSS hash key: + 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89 + +netdev_tstamp_prequeue +---------------------- + +If set to 0, RX packet timestamps can be sampled after RPS processing, when +the target CPU processes packets. It might give some delay on timestamps, but +permit to distribute the load on several cpus. + +If set to 1 (default), timestamps are sampled as soon as possible, before +queueing. + +optmem_max +---------- + +Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence +of struct cmsghdr structures with appended data. + +fb_tunnels_only_for_init_net +---------------------------- + +Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0, +sit0, ip6tnl0, ip6gre0) are automatically created when a new +network namespace is created, if corresponding tunnel is present +in initial network namespace. +If set to 1, these devices are not automatically created, and +user space is responsible for creating them if needed. + +Default : 0 (for compatibility reasons) + +devconf_inherit_init_net +------------------------ + +Controls if a new network namespace should inherit all current +settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By +default, we keep the current behavior: for IPv4 we inherit all current +settings from init_net and for IPv6 we reset all settings to default. + +If set to 1, both IPv4 and IPv6 settings are forced to inherit from +current ones in init_net. If set to 2, both IPv4 and IPv6 settings are +forced to reset to their default values. + +Default : 0 (for compatibility reasons) + +2. /proc/sys/net/unix - Parameters for Unix domain sockets +---------------------------------------------------------- + +There is only one file in this directory. +unix_dgram_qlen limits the max number of datagrams queued in Unix domain +socket's buffer. It will not take effect unless PF_UNIX flag is specified. + + +3. /proc/sys/net/ipv4 - IPV4 settings +------------------------------------- +Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for +descriptions of these entries. + + +4. Appletalk +------------ + +The /proc/sys/net/appletalk directory holds the Appletalk configuration data +when Appletalk is loaded. The configurable parameters are: + +aarp-expiry-time +---------------- + +The amount of time we keep an ARP entry before expiring it. Used to age out +old hosts. + +aarp-resolve-time +----------------- + +The amount of time we will spend trying to resolve an Appletalk address. + +aarp-retransmit-limit +--------------------- + +The number of times we will retransmit a query before giving up. + +aarp-tick-time +-------------- + +Controls the rate at which expires are checked. + +The directory /proc/net/appletalk holds the list of active Appletalk sockets +on a machine. + +The fields indicate the DDP type, the local address (in network:node format) +the remote address, the size of the transmit pending queue, the size of the +received queue (bytes waiting for applications to read) the state and the uid +owning the socket. + +/proc/net/atalk_iface lists all the interfaces configured for appletalk.It +shows the name of the interface, its Appletalk address, the network range on +that address (or network number for phase 1 networks), and the status of the +interface. + +/proc/net/atalk_route lists each known network route. It lists the target +(network) that the route leads to, the router (may be directly connected), the +route flags, and the device the route is using. + + +5. IPX +------ + +The IPX protocol has no tunable values in proc/sys/net. + +The IPX protocol does, however, provide proc/net/ipx. This lists each IPX +socket giving the local and remote addresses in Novell format (that is +network:node:port). In accordance with the strange Novell tradition, +everything but the port is in hex. Not_Connected is displayed for sockets that +are not tied to a specific remote address. The Tx and Rx queue sizes indicate +the number of bytes pending for transmission and reception. The state +indicates the state the socket is in and the uid is the owning uid of the +socket. + +The /proc/net/ipx_interface file lists all IPX interfaces. For each interface +it gives the network number, the node number, and indicates if the network is +the primary network. It also indicates which device it is bound to (or +Internal for internal networks) and the Frame Type if appropriate. Linux +supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for +IPX. + +The /proc/net/ipx_route table holds a list of IPX routes. For each route it +gives the destination network, the router node (or Directly) and the network +address of the router (or Connected) for internal networks. + +6. TIPC +------- + +tipc_rmem +--------- + +The TIPC protocol now has a tunable for the receive memory, similar to the +tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max) + +:: + + # cat /proc/sys/net/tipc/tipc_rmem + 4252725 34021800 68043600 + # + +The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values +are scaled (shifted) versions of that same value. Note that the min value +is not at this point in time used in any meaningful way, but the triplet is +preserved in order to be consistent with things like tcp_rmem. + +named_timeout +------------- + +TIPC name table updates are distributed asynchronously in a cluster, without +any form of transaction handling. This means that different race scenarios are +possible. One such is that a name withdrawal sent out by one node and received +by another node may arrive after a second, overlapping name publication already +has been accepted from a third node, although the conflicting updates +originally may have been issued in the correct sequential order. +If named_timeout is nonzero, failed topology updates will be placed on a defer +queue until another event arrives that clears the error, or until the timeout +expires. Value is in milliseconds. diff --git a/Documentation/admin-guide/sysctl/sunrpc.rst b/Documentation/admin-guide/sysctl/sunrpc.rst new file mode 100644 index 000000000000..09780a682afd --- /dev/null +++ b/Documentation/admin-guide/sysctl/sunrpc.rst @@ -0,0 +1,25 @@ +=================================== +Documentation for /proc/sys/sunrpc/ +=================================== + +kernel version 2.2.10 + +Copyright (c) 1998, 1999, Rik van Riel + +For general info and legal blurb, please look in index.rst. + +------------------------------------------------------------------------------ + +This file contains the documentation for the sysctl files in +/proc/sys/sunrpc and is valid for Linux kernel version 2.2. + +The files in this directory can be used to (re)set the debug +flags of the SUN Remote Procedure Call (RPC) subsystem in +the Linux kernel. This stuff is used for NFS, KNFSD and +maybe a few other things as well. + +The files in there are used to control the debugging flags: +rpc_debug, nfs_debug, nfsd_debug and nlm_debug. + +These flags are for kernel hackers only. You should read the +source code in net/sunrpc/ for more information. diff --git a/Documentation/admin-guide/sysctl/user.rst b/Documentation/admin-guide/sysctl/user.rst new file mode 100644 index 000000000000..650eaa03f15e --- /dev/null +++ b/Documentation/admin-guide/sysctl/user.rst @@ -0,0 +1,78 @@ +================================= +Documentation for /proc/sys/user/ +================================= + +kernel version 4.9.0 + +Copyright (c) 2016 Eric Biederman + +------------------------------------------------------------------------------ + +This file contains the documentation for the sysctl files in +/proc/sys/user. + +The files in this directory can be used to override the default +limits on the number of namespaces and other objects that have +per user per user namespace limits. + +The primary purpose of these limits is to stop programs that +malfunction and attempt to create a ridiculous number of objects, +before the malfunction becomes a system wide problem. It is the +intention that the defaults of these limits are set high enough that +no program in normal operation should run into these limits. + +The creation of per user per user namespace objects are charged to +the user in the user namespace who created the object and +verified to be below the per user limit in that user namespace. + +The creation of objects is also charged to all of the users +who created user namespaces the creation of the object happens +in (user namespaces can be nested) and verified to be below the per user +limits in the user namespaces of those users. + +This recursive counting of created objects ensures that creating a +user namespace does not allow a user to escape their current limits. + +Currently, these files are in /proc/sys/user: + +max_cgroup_namespaces +===================== + + The maximum number of cgroup namespaces that any user in the current + user namespace may create. + +max_ipc_namespaces +================== + + The maximum number of ipc namespaces that any user in the current + user namespace may create. + +max_mnt_namespaces +================== + + The maximum number of mount namespaces that any user in the current + user namespace may create. + +max_net_namespaces +================== + + The maximum number of network namespaces that any user in the + current user namespace may create. + +max_pid_namespaces +================== + + The maximum number of pid namespaces that any user in the current + user namespace may create. + +max_user_namespaces +=================== + + The maximum number of user namespaces that any user in the current + user namespace may create. + +max_uts_namespaces +================== + + The maximum number of user namespaces that any user in the current + user namespace may create. diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst new file mode 100644 index 000000000000..5aceb5cd5ce7 --- /dev/null +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -0,0 +1,964 @@ +=============================== +Documentation for /proc/sys/vm/ +=============================== + +kernel version 2.6.29 + +Copyright (c) 1998, 1999, Rik van Riel + +Copyright (c) 2008 Peter W. Morreale + +For general info and legal blurb, please look in index.rst. + +------------------------------------------------------------------------------ + +This file contains the documentation for the sysctl files in +/proc/sys/vm and is valid for Linux kernel version 2.6.29. + +The files in this directory can be used to tune the operation +of the virtual memory (VM) subsystem of the Linux kernel and +the writeout of dirty data to disk. + +Default values and initialization routines for most of these +files can be found in mm/swap.c. + +Currently, these files are in /proc/sys/vm: + +- admin_reserve_kbytes +- block_dump +- compact_memory +- compact_unevictable_allowed +- dirty_background_bytes +- dirty_background_ratio +- dirty_bytes +- dirty_expire_centisecs +- dirty_ratio +- dirtytime_expire_seconds +- dirty_writeback_centisecs +- drop_caches +- extfrag_threshold +- hugetlb_shm_group +- laptop_mode +- legacy_va_layout +- lowmem_reserve_ratio +- max_map_count +- memory_failure_early_kill +- memory_failure_recovery +- min_free_kbytes +- min_slab_ratio +- min_unmapped_ratio +- mmap_min_addr +- mmap_rnd_bits +- mmap_rnd_compat_bits +- nr_hugepages +- nr_hugepages_mempolicy +- nr_overcommit_hugepages +- nr_trim_pages (only if CONFIG_MMU=n) +- numa_zonelist_order +- oom_dump_tasks +- oom_kill_allocating_task +- overcommit_kbytes +- overcommit_memory +- overcommit_ratio +- page-cluster +- panic_on_oom +- percpu_pagelist_fraction +- stat_interval +- stat_refresh +- numa_stat +- swappiness +- unprivileged_userfaultfd +- user_reserve_kbytes +- vfs_cache_pressure +- watermark_boost_factor +- watermark_scale_factor +- zone_reclaim_mode + + +admin_reserve_kbytes +==================== + +The amount of free memory in the system that should be reserved for users +with the capability cap_sys_admin. + +admin_reserve_kbytes defaults to min(3% of free pages, 8MB) + +That should provide enough for the admin to log in and kill a process, +if necessary, under the default overcommit 'guess' mode. + +Systems running under overcommit 'never' should increase this to account +for the full Virtual Memory Size of programs used to recover. Otherwise, +root may not be able to log in to recover the system. + +How do you calculate a minimum useful reserve? + +sshd or login + bash (or some other shell) + top (or ps, kill, etc.) + +For overcommit 'guess', we can sum resident set sizes (RSS). +On x86_64 this is about 8MB. + +For overcommit 'never', we can take the max of their virtual sizes (VSZ) +and add the sum of their RSS. +On x86_64 this is about 128MB. + +Changing this takes effect whenever an application requests memory. + + +block_dump +========== + +block_dump enables block I/O debugging when set to a nonzero value. More +information on block I/O debugging is in Documentation/laptops/laptop-mode.rst. + + +compact_memory +============== + +Available only when CONFIG_COMPACTION is set. When 1 is written to the file, +all zones are compacted such that free memory is available in contiguous +blocks where possible. This can be important for example in the allocation of +huge pages although processes will also directly compact memory as required. + + +compact_unevictable_allowed +=========================== + +Available only when CONFIG_COMPACTION is set. When set to 1, compaction is +allowed to examine the unevictable lru (mlocked pages) for pages to compact. +This should be used on systems where stalls for minor page faults are an +acceptable trade for large contiguous free memory. Set to 0 to prevent +compaction from moving pages that are unevictable. Default value is 1. + + +dirty_background_bytes +====================== + +Contains the amount of dirty memory at which the background kernel +flusher threads will start writeback. + +Note: + dirty_background_bytes is the counterpart of dirty_background_ratio. Only + one of them may be specified at a time. When one sysctl is written it is + immediately taken into account to evaluate the dirty memory limits and the + other appears as 0 when read. + + +dirty_background_ratio +====================== + +Contains, as a percentage of total available memory that contains free pages +and reclaimable pages, the number of pages at which the background kernel +flusher threads will start writing out dirty data. + +The total available memory is not equal to total system memory. + + +dirty_bytes +=========== + +Contains the amount of dirty memory at which a process generating disk writes +will itself start writeback. + +Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be +specified at a time. When one sysctl is written it is immediately taken into +account to evaluate the dirty memory limits and the other appears as 0 when +read. + +Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any +value lower than this limit will be ignored and the old configuration will be +retained. + + +dirty_expire_centisecs +====================== + +This tunable is used to define when dirty data is old enough to be eligible +for writeout by the kernel flusher threads. It is expressed in 100'ths +of a second. Data which has been dirty in-memory for longer than this +interval will be written out next time a flusher thread wakes up. + + +dirty_ratio +=========== + +Contains, as a percentage of total available memory that contains free pages +and reclaimable pages, the number of pages at which a process which is +generating disk writes will itself start writing out dirty data. + +The total available memory is not equal to total system memory. + + +dirtytime_expire_seconds +======================== + +When a lazytime inode is constantly having its pages dirtied, the inode with +an updated timestamp will never get chance to be written out. And, if the +only thing that has happened on the file system is a dirtytime inode caused +by an atime update, a worker will be scheduled to make sure that inode +eventually gets pushed out to disk. This tunable is used to define when dirty +inode is old enough to be eligible for writeback by the kernel flusher threads. +And, it is also used as the interval to wakeup dirtytime_writeback thread. + + +dirty_writeback_centisecs +========================= + +The kernel flusher threads will periodically wake up and write `old` data +out to disk. This tunable expresses the interval between those wakeups, in +100'ths of a second. + +Setting this to zero disables periodic writeback altogether. + + +drop_caches +=========== + +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. + +To free pagecache:: + + echo 1 > /proc/sys/vm/drop_caches + +To free reclaimable slab objects (includes dentries and inodes):: + + echo 2 > /proc/sys/vm/drop_caches + +To free slab objects and pagecache:: + + echo 3 > /proc/sys/vm/drop_caches + +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc...) These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used:: + + cat (1234): drop_caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. To disable them, echo 4 (bit 2) into drop_caches. + + +extfrag_threshold +================= + +This parameter affects whether the kernel will compact memory or direct +reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in +debugfs shows what the fragmentation index for each order is in each zone in +the system. Values tending towards 0 imply allocations would fail due to lack +of memory, values towards 1000 imply failures are due to fragmentation and -1 +implies that the allocation will succeed as long as watermarks are met. + +The kernel will not compact memory in a zone if the +fragmentation index is <= extfrag_threshold. The default value is 500. + + +highmem_is_dirtyable +==================== + +Available only for systems with CONFIG_HIGHMEM enabled (32b systems). + +This parameter controls whether the high memory is considered for dirty +writers throttling. This is not the case by default which means that +only the amount of memory directly visible/usable by the kernel can +be dirtied. As a result, on systems with a large amount of memory and +lowmem basically depleted writers might be throttled too early and +streaming writes can get very slow. + +Changing the value to non zero would allow more memory to be dirtied +and thus allow writers to write more data which can be flushed to the +storage more effectively. Note this also comes with a risk of pre-mature +OOM killer because some writers (e.g. direct block device writes) can +only use the low memory and they can fill it up with dirty data without +any throttling. + + +hugetlb_shm_group +================= + +hugetlb_shm_group contains group id that is allowed to create SysV +shared memory segment using hugetlb page. + + +laptop_mode +=========== + +laptop_mode is a knob that controls "laptop mode". All the things that are +controlled by this knob are discussed in Documentation/laptops/laptop-mode.rst. + + +legacy_va_layout +================ + +If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel +will use the legacy (2.4) layout for all processes. + + +lowmem_reserve_ratio +==================== + +For some specialised workloads on highmem machines it is dangerous for +the kernel to allow process memory to be allocated from the "lowmem" +zone. This is because that memory could then be pinned via the mlock() +system call, or by unavailability of swapspace. + +And on large highmem machines this lack of reclaimable lowmem memory +can be fatal. + +So the Linux page allocator has a mechanism which prevents allocations +which *could* use highmem from using too much lowmem. This means that +a certain amount of lowmem is defended from the possibility of being +captured into pinned user memory. + +(The same argument applies to the old 16 megabyte ISA DMA region. This +mechanism will also defend that region from allocations which could use +highmem or lowmem). + +The `lowmem_reserve_ratio` tunable determines how aggressive the kernel is +in defending these lower zones. + +If you have a machine which uses highmem or ISA DMA and your +applications are using mlock(), or if you are running with no swap then +you probably should change the lowmem_reserve_ratio setting. + +The lowmem_reserve_ratio is an array. You can see them by reading this file:: + + % cat /proc/sys/vm/lowmem_reserve_ratio + 256 256 32 + +But, these values are not used directly. The kernel calculates # of protection +pages for each zones from them. These are shown as array of protection pages +in /proc/zoneinfo like followings. (This is an example of x86-64 box). +Each zone has an array of protection pages like this:: + + Node 0, zone DMA + pages free 1355 + min 3 + low 3 + high 4 + : + : + numa_other 0 + protection: (0, 2004, 2004, 2004) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + pagesets + cpu: 0 pcp: 0 + : + +These protections are added to score to judge whether this zone should be used +for page allocation or should be reclaimed. + +In this example, if normal pages (index=2) are required to this DMA zone and +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should +not be used because pages_free(1355) is smaller than watermark + protection[2] +(4 + 2004 = 2008). If this protection value is 0, this zone would be used for +normal page requirement. If requirement is DMA zone(index=0), protection[0] +(=0) is used. + +zone[i]'s protection[j] is calculated by following expression:: + + (i < j): + zone[i]->protection[j] + = (total sums of managed_pages from zone[i+1] to zone[j] on the node) + / lowmem_reserve_ratio[i]; + (i = j): + (should not be protected. = 0; + (i > j): + (not necessary, but looks 0) + +The default values of lowmem_reserve_ratio[i] are + + === ==================================== + 256 (if zone[i] means DMA or DMA32 zone) + 32 (others) + === ==================================== + +As above expression, they are reciprocal number of ratio. +256 means 1/256. # of protection pages becomes about "0.39%" of total managed +pages of higher zones on the node. + +If you would like to protect more pages, smaller values are effective. +The minimum value is 1 (1/1 -> 100%). The value less than 1 completely +disables protection of the pages. + + +max_map_count: +============== + +This file contains the maximum number of memory map areas a process +may have. Memory map areas are used as a side-effect of calling +malloc, directly by mmap, mprotect, and madvise, and also when loading +shared libraries. + +While most applications need less than a thousand maps, certain +programs, particularly malloc debuggers, may consume lots of them, +e.g., up to one or two maps per allocation. + +The default value is 65536. + + +memory_failure_early_kill: +========================== + +Control how to kill processes when uncorrected memory error (typically +a 2bit error in a memory module) is detected in the background by hardware +that cannot be handled by the kernel. In some cases (like the page +still having a valid copy on disk) the kernel will handle the failure +transparently without affecting any applications. But if there is +no other uptodate copy of the data it will kill to prevent any data +corruptions from propagating. + +1: Kill all processes that have the corrupted and not reloadable page mapped +as soon as the corruption is detected. Note this is not supported +for a few types of pages, like kernel internally allocated data or +the swap cache, but works for the majority of user pages. + +0: Only unmap the corrupted page from all processes and only kill a process +who tries to access it. + +The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can +handle this if they want to. + +This is only active on architectures/platforms with advanced machine +check handling and depends on the hardware capabilities. + +Applications can override this setting individually with the PR_MCE_KILL prctl + + +memory_failure_recovery +======================= + +Enable memory failure recovery (when supported by the platform) + +1: Attempt recovery. + +0: Always panic on a memory failure. + + +min_free_kbytes +=============== + +This is used to force the Linux VM to keep a minimum number +of kilobytes free. The VM uses this number to compute a +watermark[WMARK_MIN] value for each lowmem zone in the system. +Each lowmem zone gets a number of reserved free pages based +proportionally on its size. + +Some minimal amount of memory is needed to satisfy PF_MEMALLOC +allocations; if you set this to lower than 1024KB, your system will +become subtly broken, and prone to deadlock under high loads. + +Setting this too high will OOM your machine instantly. + + +min_slab_ratio +============== + +This is available only on NUMA kernels. + +A percentage of the total pages in each zone. On Zone reclaim +(fallback from the local zone occurs) slabs will be reclaimed if more +than this percentage of pages in a zone are reclaimable slab pages. +This insures that the slab growth stays under control even in NUMA +systems that rarely perform global reclaim. + +The default is 5 percent. + +Note that slab reclaim is triggered in a per zone / node fashion. +The process of reclaiming slab memory is currently not node specific +and may not be fast. + + +min_unmapped_ratio +================== + +This is available only on NUMA kernels. + +This is a percentage of the total pages in each zone. Zone reclaim will +only occur if more than this percentage of pages are in a state that +zone_reclaim_mode allows to be reclaimed. + +If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared +against all file-backed unmapped pages including swapcache pages and tmpfs +files. Otherwise, only unmapped pages backed by normal files but not tmpfs +files and similar are considered. + +The default is 1 percent. + + +mmap_min_addr +============= + +This file indicates the amount of address space which a user process will +be restricted from mmapping. Since kernel null dereference bugs could +accidentally operate based on the information in the first couple of pages +of memory userspace processes should not be allowed to write to them. By +default this value is set to 0 and no protections will be enforced by the +security module. Setting this value to something like 64k will allow the +vast majority of applications to work correctly and provide defense in depth +against future potential kernel bugs. + + +mmap_rnd_bits +============= + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations on architectures which support +tuning address space randomization. This value will be bounded +by the architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_bits tunable + + +mmap_rnd_compat_bits +==================== + +This value can be used to select the number of bits to use to +determine the random offset to the base address of vma regions +resulting from mmap allocations for applications run in +compatibility mode on architectures which support tuning address +space randomization. This value will be bounded by the +architecture's minimum and maximum supported values. + +This value can be changed after boot using the +/proc/sys/vm/mmap_rnd_compat_bits tunable + + +nr_hugepages +============ + +Change the minimum size of the hugepage pool. + +See Documentation/admin-guide/mm/hugetlbpage.rst + + +nr_hugepages_mempolicy +====================== + +Change the size of the hugepage pool at run-time on a specific +set of NUMA nodes. + +See Documentation/admin-guide/mm/hugetlbpage.rst + + +nr_overcommit_hugepages +======================= + +Change the maximum size of the hugepage pool. The maximum is +nr_hugepages + nr_overcommit_hugepages. + +See Documentation/admin-guide/mm/hugetlbpage.rst + + +nr_trim_pages +============= + +This is available only on NOMMU kernels. + +This value adjusts the excess page trimming behaviour of power-of-2 aligned +NOMMU mmap allocations. + +A value of 0 disables trimming of allocations entirely, while a value of 1 +trims excess pages aggressively. Any value >= 1 acts as the watermark where +trimming of allocations is initiated. + +The default value is 1. + +See Documentation/nommu-mmap.txt for more information. + + +numa_zonelist_order +=================== + +This sysctl is only for NUMA and it is deprecated. Anything but +Node order will fail! + +'where the memory is allocated from' is controlled by zonelists. + +(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. +you may be able to read ZONE_DMA as ZONE_DMA32...) + +In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following. +ZONE_NORMAL -> ZONE_DMA +This means that a memory allocation request for GFP_KERNEL will +get memory from ZONE_DMA only when ZONE_NORMAL is not available. + +In NUMA case, you can think of following 2 types of order. +Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL:: + + (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL + (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA. + +Type(A) offers the best locality for processes on Node(0), but ZONE_DMA +will be used before ZONE_NORMAL exhaustion. This increases possibility of +out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small. + +Type(B) cannot offer the best locality but is more robust against OOM of +the DMA zone. + +Type(A) is called as "Node" order. Type (B) is "Zone" order. + +"Node order" orders the zonelists by node, then by zone within each node. +Specify "[Nn]ode" for node order + +"Zone Order" orders the zonelists by zone type, then by node within each +zone. Specify "[Zz]one" for zone order. + +Specify "[Dd]efault" to request automatic configuration. + +On 32-bit, the Normal zone needs to be preserved for allocations accessible +by the kernel, so "zone" order will be selected. + +On 64-bit, devices that require DMA32/DMA are relatively rare, so "node" +order will be selected. + +Default order is recommended unless this is causing problems for your +system/application. + + +oom_dump_tasks +============== + +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. + +If this is set to zero, this information is suppressed. On very +large systems with thousands of tasks it may not be feasible to dump +the memory state information for each one. Such systems should not +be forced to incur a performance penalty in OOM conditions when the +information may not be desired. + +If this is set to non-zero, this information is shown whenever the +OOM killer actually kills a memory-hogging task. + +The default value is 1 (enabled). + + +oom_kill_allocating_task +======================== + +This enables or disables killing the OOM-triggering task in +out-of-memory situations. + +If this is set to zero, the OOM killer will scan through the entire +tasklist and select a task based on heuristics to kill. This normally +selects a rogue memory-hogging task that frees up a large amount of +memory when killed. + +If this is set to non-zero, the OOM killer simply kills the task that +triggered the out-of-memory condition. This avoids the expensive +tasklist scan. + +If panic_on_oom is selected, it takes precedence over whatever value +is used in oom_kill_allocating_task. + +The default value is 0. + + +overcommit_kbytes +================= + +When overcommit_memory is set to 2, the committed address space is not +permitted to exceed swap plus this amount of physical RAM. See below. + +Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one +of them may be specified at a time. Setting one disables the other (which +then appears as 0 when read). + + +overcommit_memory +================= + +This value contains a flag that enables memory overcommitment. + +When this flag is 0, the kernel attempts to estimate the amount +of free memory left when userspace requests more memory. + +When this flag is 1, the kernel pretends there is always enough +memory until it actually runs out. + +When this flag is 2, the kernel uses a "never overcommit" +policy that attempts to prevent any overcommit of memory. +Note that user_reserve_kbytes affects this policy. + +This feature can be very useful because there are a lot of +programs that malloc() huge amounts of memory "just-in-case" +and don't use much of it. + +The default value is 0. + +See Documentation/vm/overcommit-accounting.rst and +mm/util.c::__vm_enough_memory() for more information. + + +overcommit_ratio +================ + +When overcommit_memory is set to 2, the committed address +space is not permitted to exceed swap plus this percentage +of physical RAM. See above. + + +page-cluster +============ + +page-cluster controls the number of pages up to which consecutive pages +are read in from swap in a single attempt. This is the swap counterpart +to page cache readahead. +The mentioned consecutivity is not in terms of virtual/physical addresses, +but consecutive on swap space - that means they were swapped out together. + +It is a logarithmic value - setting it to zero means "1 page", setting +it to 1 means "2 pages", setting it to 2 means "4 pages", etc. +Zero disables swap readahead completely. + +The default value is three (eight pages at a time). There may be some +small benefits in tuning this to a different value if your workload is +swap-intensive. + +Lower values mean lower latencies for initial faults, but at the same time +extra faults and I/O delays for following faults if they would have been part of +that consecutive pages readahead would have brought in. + + +panic_on_oom +============ + +This enables or disables panic on out-of-memory feature. + +If this is set to 0, the kernel will kill some rogue process, +called oom_killer. Usually, oom_killer can kill rogue processes and +system will survive. + +If this is set to 1, the kernel panics when out-of-memory happens. +However, if a process limits using nodes by mempolicy/cpusets, +and those nodes become memory exhaustion status, one process +may be killed by oom-killer. No panic occurs in this case. +Because other nodes' memory may be free. This means system total status +may be not fatal yet. + +If this is set to 2, the kernel panics compulsorily even on the +above-mentioned. Even oom happens under memory cgroup, the whole +system panics. + +The default value is 0. + +1 and 2 are for failover of clustering. Please select either +according to your policy of failover. + +panic_on_oom=2+kdump gives you very strong tool to investigate +why oom happens. You can get snapshot. + + +percpu_pagelist_fraction +======================== + +This is the fraction of pages at most (high mark pcp->high) in each zone that +are allocated for each per cpu page list. The min value for this is 8. It +means that we don't allow more than 1/8th of pages in each zone to be +allocated in any single per_cpu_pagelist. This entry only changes the value +of hot per cpu pagelists. User can specify a number like 100 to allocate +1/100th of each zone to each per cpu page list. + +The batch value of each per cpu pagelist is also updated as a result. It is +set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) + +The initial value is zero. Kernel does not use this value at boot time to set +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. + + +stat_interval +============= + +The time interval between which vm statistics are updated. The default +is 1 second. + + +stat_refresh +============ + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) + + +numa_stat +========= + +This interface allows runtime configuration of numa statistics. + +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do:: + + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do:: + + echo 1 > /proc/sys/vm/numa_stat + + +swappiness +========== + +This control is used to define how aggressive the kernel will swap +memory pages. Higher values will increase aggressiveness, lower values +decrease the amount of swap. A value of 0 instructs the kernel not to +initiate swap until the amount of free and file-backed pages is less +than the high water mark in a zone. + +The default value is 60. + + +unprivileged_userfaultfd +======================== + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + + +user_reserve_kbytes +=================== + +When overcommit_memory is set to 2, "never overcommit" mode, reserve +min(3% of current process size, user_reserve_kbytes) of free memory. +This is intended to prevent a user from starting a single memory hogging +process, such that they cannot recover (kill the hog). + +user_reserve_kbytes defaults to min(3% of the current process size, 128MB). + +If this is reduced to zero, then the user will be allowed to allocate +all free memory with a single process, minus admin_reserve_kbytes. +Any subsequent attempts to execute a command will result in +"fork: Cannot allocate memory". + +Changing this takes effect whenever an application requests memory. + + +vfs_cache_pressure +================== + +This percentage value controls the tendency of the kernel to reclaim +the memory which is used for caching of directory and inode objects. + +At the default value of vfs_cache_pressure=100 the kernel will attempt to +reclaim dentries and inodes at a "fair" rate with respect to pagecache and +swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer +to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will +never reclaim dentries and inodes due to memory pressure and this can easily +lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100 +causes the kernel to prefer to reclaim dentries and inodes. + +Increasing vfs_cache_pressure significantly beyond 100 may have negative +performance impact. Reclaim code needs to take various locks to find freeable +directory and inode objects. With vfs_cache_pressure=1000, it will look for +ten times more freeable objects than there are. + + +watermark_boost_factor +====================== + +This factor controls the level of reclaim when memory is being fragmented. +It defines the percentage of the high watermark of a zone that will be +reclaimed if pages of different mobility are being mixed within pageblocks. +The intent is that compaction has less work to do in the future and to +increase the success rate of future high-order allocations such as SLUB +allocations, THP and hugetlbfs pages. + +To make it sensible with respect to the watermark_scale_factor +parameter, the unit is in fractions of 10,000. The default value of +15,000 on !DISCONTIGMEM configurations means that up to 150% of the high +watermark will be reclaimed in the event of a pageblock being mixed due +to fragmentation. The level of reclaim is determined by the number of +fragmentation events that occurred in the recent past. If this value is +smaller than a pageblock then a pageblocks worth of pages will be reclaimed +(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. + + +watermark_scale_factor +====================== + +This factor controls the aggressiveness of kswapd. It defines the +amount of memory left in a node/system before kswapd is woken up and +how much memory needs to be free before kswapd goes back to sleep. + +The unit is in fractions of 10,000. The default value of 10 means the +distances between watermarks are 0.1% of the available memory in the +node/system. The maximum value is 1000, or 10% of memory. + +A high rate of threads entering direct reclaim (allocstall) or kswapd +going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate +that the number of free pages kswapd maintains for latency reasons is +too small for the allocation bursts occurring in the system. This knob +can then be used to tune kswapd aggressiveness accordingly. + + +zone_reclaim_mode +================= + +Zone_reclaim_mode allows someone to set more or less aggressive approaches to +reclaim memory when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. Allocations will be satisfied from other zones / nodes +in the system. + +This is value OR'ed together of + += =================================== +1 Zone reclaim on +2 Zone reclaim writes dirty pages out +4 Zone reclaim swaps pages += =================================== + +zone_reclaim_mode is disabled by default. For file servers or workloads +that benefit from having their data cached, zone_reclaim_mode should be +left disabled as the caching effect is likely to be more important than +data locality. + +zone_reclaim may be enabled if it's known that the workload is partitioned +such that each partition fits within a NUMA node and that accessing remote +memory would cause a measurable performance reduction. The page allocator +will then reclaim easily reusable pages (those page cache pages that are +currently not used) before allocating off node pages. + +Allowing zone reclaim to write out pages stops processes that are +writing large amounts of data from dirtying pages on other nodes. Zone +reclaim will write out dirty pages if a zone fills up and so effectively +throttle the process. This may decrease the performance of a single process +since it cannot use all of system memory to buffer the outgoing writes +anymore but it preserve the memory on other nodes so that the performance +of other processes running on other nodes will not be affected. + +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policies or cpuset +configurations. diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 1d8e748f909f..c6224d039bcb 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -119,7 +119,7 @@ Kernel Pointers For printing kernel pointers which should be hidden from unprivileged users. The behaviour of %pK depends on the kptr_restrict sysctl - see -Documentation/sysctl/kernel.rst for more details. +Documentation/admin-guide/sysctl/kernel.rst for more details. Unmodified Addresses -------------------- diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d750b6926899..fb4735fd73b0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1500,7 +1500,7 @@ review the kernel documentation in the directory /usr/src/linux/Documentation. This chapter is heavily based on the documentation included in the pre 2.2 kernels, and became part of it in version 2.2.1 of the Linux kernel. -Please see: Documentation/sysctl/ directory for descriptions of these +Please see: Documentation/admin-guide/sysctl/ directory for descriptions of these entries. ------------------------------------------------------------------------------ diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5c3399cde1c4..df33674799b5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -2287,7 +2287,7 @@ addr_scope_policy - INTEGER /proc/sys/net/core/* - Please see: Documentation/sysctl/net.rst for descriptions of these entries. + Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. /proc/sys/net/unix/* diff --git a/Documentation/sysctl/abi.rst b/Documentation/sysctl/abi.rst deleted file mode 100644 index 599bcde7f0b7..000000000000 --- a/Documentation/sysctl/abi.rst +++ /dev/null @@ -1,67 +0,0 @@ -================================ -Documentation for /proc/sys/abi/ -================================ - -kernel version 2.6.0.test2 - -Copyright (c) 2003, Fabian Frederick - -For general info: index.rst. - ------------------------------------------------------------------------------- - -This path is binary emulation relevant aka personality types aka abi. -When a process is executed, it's linked to an exec_domain whose -personality is defined using values available from /proc/sys/abi. -You can find further details about abi in include/linux/personality.h. - -Here are the files featuring in 2.6 kernel: - -- defhandler_coff -- defhandler_elf -- defhandler_lcall7 -- defhandler_libcso -- fake_utsname -- trace - -defhandler_coff ---------------- - -defined value: - PER_SCOSVR3:: - - 0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE - -defhandler_elf --------------- - -defined value: - PER_LINUX:: - - 0 - -defhandler_lcall7 ------------------ - -defined value : - PER_SVR4:: - - 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO, - -defhandler_libsco ------------------ - -defined value: - PER_SVR4:: - - 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO, - -fake_utsname ------------- - -Unused - -trace ------ - -Unused diff --git a/Documentation/sysctl/fs.rst b/Documentation/sysctl/fs.rst deleted file mode 100644 index 2a45119e3331..000000000000 --- a/Documentation/sysctl/fs.rst +++ /dev/null @@ -1,384 +0,0 @@ -=============================== -Documentation for /proc/sys/fs/ -=============================== - -kernel version 2.2.10 - -Copyright (c) 1998, 1999, Rik van Riel - -Copyright (c) 2009, Shen Feng - -For general info and legal blurb, please look in intro.rst. - ------------------------------------------------------------------------------- - -This file contains documentation for the sysctl files in -/proc/sys/fs/ and is valid for Linux kernel version 2.2. - -The files in this directory can be used to tune and monitor -miscellaneous and general things in the operation of the Linux -kernel. Since some of the files _can_ be used to screw up your -system, it is advisable to read both documentation and source -before actually making adjustments. - -1. /proc/sys/fs -=============== - -Currently, these files are in /proc/sys/fs: - -- aio-max-nr -- aio-nr -- dentry-state -- dquot-max -- dquot-nr -- file-max -- file-nr -- inode-max -- inode-nr -- inode-state -- nr_open -- overflowuid -- overflowgid -- pipe-user-pages-hard -- pipe-user-pages-soft -- protected_fifos -- protected_hardlinks -- protected_regular -- protected_symlinks -- suid_dumpable -- super-max -- super-nr - - -aio-nr & aio-max-nr -------------------- - -aio-nr is the running total of the number of events specified on the -io_setup system call for all currently active aio contexts. If aio-nr -reaches aio-max-nr then io_setup will fail with EAGAIN. Note that -raising aio-max-nr does not result in the pre-allocation or re-sizing -of any kernel data structures. - - -dentry-state ------------- - -From linux/include/linux/dcache.h:: - - struct dentry_stat_t dentry_stat { - int nr_dentry; - int nr_unused; - int age_limit; /* age in seconds */ - int want_pages; /* pages requested by system */ - int nr_negative; /* # of unused negative dentries */ - int dummy; /* Reserved for future use */ - }; - -Dentries are dynamically allocated and deallocated. - -nr_dentry shows the total number of dentries allocated (active -+ unused). nr_unused shows the number of dentries that are not -actively used, but are saved in the LRU list for future reuse. - -Age_limit is the age in seconds after which dcache entries -can be reclaimed when memory is short and want_pages is -nonzero when shrink_dcache_pages() has been called and the -dcache isn't pruned yet. - -nr_negative shows the number of unused dentries that are also -negative dentries which do not map to any files. Instead, -they help speeding up rejection of non-existing files provided -by the users. - - -dquot-max & dquot-nr --------------------- - -The file dquot-max shows the maximum number of cached disk -quota entries. - -The file dquot-nr shows the number of allocated disk quota -entries and the number of free disk quota entries. - -If the number of free cached disk quotas is very low and -you have some awesome number of simultaneous system users, -you might want to raise the limit. - - -file-max & file-nr ------------------- - -The value in file-max denotes the maximum number of file- -handles that the Linux kernel will allocate. When you get lots -of error messages about running out of file handles, you might -want to increase this limit. - -Historically,the kernel was able to allocate file handles -dynamically, but not to free them again. The three values in -file-nr denote the number of allocated file handles, the number -of allocated but unused file handles, and the maximum number of -file handles. Linux 2.6 always reports 0 as the number of free -file handles -- this is not an error, it just means that the -number of allocated file handles exactly matches the number of -used file handles. - -Attempts to allocate more file descriptors than file-max are -reported with printk, look for "VFS: file-max limit -reached". - - -nr_open -------- - -This denotes the maximum number of file-handles a process can -allocate. Default value is 1024*1024 (1048576) which should be -enough for most machines. Actual limit depends on RLIMIT_NOFILE -resource limit. - - -inode-max, inode-nr & inode-state ---------------------------------- - -As with file handles, the kernel allocates the inode structures -dynamically, but can't free them yet. - -The value in inode-max denotes the maximum number of inode -handlers. This value should be 3-4 times larger than the value -in file-max, since stdin, stdout and network sockets also -need an inode struct to handle them. When you regularly run -out of inodes, you need to increase this value. - -The file inode-nr contains the first two items from -inode-state, so we'll skip to that file... - -Inode-state contains three actual numbers and four dummies. -The actual numbers are, in order of appearance, nr_inodes, -nr_free_inodes and preshrink. - -Nr_inodes stands for the number of inodes the system has -allocated, this can be slightly more than inode-max because -Linux allocates them one pageful at a time. - -Nr_free_inodes represents the number of free inodes (?) and -preshrink is nonzero when the nr_inodes > inode-max and the -system needs to prune the inode list instead of allocating -more. - - -overflowgid & overflowuid -------------------------- - -Some filesystems only support 16-bit UIDs and GIDs, although in Linux -UIDs and GIDs are 32 bits. When one of these filesystems is mounted -with writes enabled, any UID or GID that would exceed 65535 is translated -to a fixed value before being written to disk. - -These sysctls allow you to change the value of the fixed UID and GID. -The default is 65534. - - -pipe-user-pages-hard --------------------- - -Maximum total number of pages a non-privileged user may allocate for pipes. -Once this limit is reached, no new pipes may be allocated until usage goes -below the limit again. When set to 0, no limit is applied, which is the default -setting. - - -pipe-user-pages-soft --------------------- - -Maximum total number of pages a non-privileged user may allocate for pipes -before the pipe size gets limited to a single page. Once this limit is reached, -new pipes will be limited to a single page in size for this user in order to -limit total memory usage, and trying to increase them using fcntl() will be -denied until usage goes below the limit again. The default value allows to -allocate up to 1024 pipes at their default size. When set to 0, no limit is -applied. - - -protected_fifos ---------------- - -The intent of this protection is to avoid unintentional writes to -an attacker-controlled FIFO, where a program expected to create a regular -file. - -When set to "0", writing to FIFOs is unrestricted. - -When set to "1" don't allow O_CREAT open on FIFOs that we don't own -in world writable sticky directories, unless they are owned by the -owner of the directory. - -When set to "2" it also applies to group writable sticky directories. - -This protection is based on the restrictions in Openwall. - - -protected_hardlinks --------------------- - -A long-standing class of security issues is the hardlink-based -time-of-check-time-of-use race, most commonly seen in world-writable -directories like /tmp. The common method of exploitation of this flaw -is to cross privilege boundaries when following a given hardlink (i.e. a -root process follows a hardlink created by another user). Additionally, -on systems without separated partitions, this stops unauthorized users -from "pinning" vulnerable setuid/setgid files against being upgraded by -the administrator, or linking to special files. - -When set to "0", hardlink creation behavior is unrestricted. - -When set to "1" hardlinks cannot be created by users if they do not -already own the source file, or do not have read/write access to it. - -This protection is based on the restrictions in Openwall and grsecurity. - - -protected_regular ------------------ - -This protection is similar to protected_fifos, but it -avoids writes to an attacker-controlled regular file, where a program -expected to create one. - -When set to "0", writing to regular files is unrestricted. - -When set to "1" don't allow O_CREAT open on regular files that we -don't own in world writable sticky directories, unless they are -owned by the owner of the directory. - -When set to "2" it also applies to group writable sticky directories. - - -protected_symlinks ------------------- - -A long-standing class of security issues is the symlink-based -time-of-check-time-of-use race, most commonly seen in world-writable -directories like /tmp. The common method of exploitation of this flaw -is to cross privilege boundaries when following a given symlink (i.e. a -root process follows a symlink belonging to another user). For a likely -incomplete list of hundreds of examples across the years, please see: -http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp - -When set to "0", symlink following behavior is unrestricted. - -When set to "1" symlinks are permitted to be followed only when outside -a sticky world-writable directory, or when the uid of the symlink and -follower match, or when the directory owner matches the symlink's owner. - -This protection is based on the restrictions in Openwall and grsecurity. - - -suid_dumpable: --------------- - -This value can be used to query and set the core dump mode for setuid -or otherwise protected/tainted binaries. The modes are - -= ========== =============================================================== -0 (default) traditional behaviour. Any process which has changed - privilege levels or is execute only will not be dumped. -1 (debug) all processes dump core when possible. The core dump is - owned by the current user and no security is applied. This is - intended for system debugging situations only. - Ptrace is unchecked. - This is insecure as it allows regular users to examine the - memory contents of privileged processes. -2 (suidsafe) any binary which normally would not be dumped is dumped - anyway, but only if the "core_pattern" kernel sysctl is set to - either a pipe handler or a fully qualified path. (For more - details on this limitation, see CVE-2006-2451.) This mode is - appropriate when administrators are attempting to debug - problems in a normal environment, and either have a core dump - pipe handler that knows to treat privileged core dumps with - care, or specific directory defined for catching core dumps. - If a core dump happens without a pipe handler or fully - qualified path, a message will be emitted to syslog warning - about the lack of a correct setting. -= ========== =============================================================== - - -super-max & super-nr --------------------- - -These numbers control the maximum number of superblocks, and -thus the maximum number of mounted filesystems the kernel -can have. You only need to increase super-max if you need to -mount more filesystems than the current value in super-max -allows you to. - - -aio-nr & aio-max-nr -------------------- - -aio-nr shows the current system-wide number of asynchronous io -requests. aio-max-nr allows you to change the maximum value -aio-nr can grow to. - - -mount-max ---------- - -This denotes the maximum number of mounts that may exist -in a mount namespace. - - - -2. /proc/sys/fs/binfmt_misc -=========================== - -Documentation for the files in /proc/sys/fs/binfmt_misc is -in Documentation/admin-guide/binfmt-misc.rst. - - -3. /proc/sys/fs/mqueue - POSIX message queues filesystem -======================================================== - - -The "mqueue" filesystem provides the necessary kernel features to enable the -creation of a user space library that implements the POSIX message queues -API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System -Interfaces specification.) - -The "mqueue" filesystem contains values for determining/setting the amount of -resources used by the file system. - -/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the -maximum number of message queues allowed on the system. - -/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the -maximum number of messages in a queue value. In fact it is the limiting value -for another (user) limit which is set in mq_open invocation. This attribute of -a queue must be less or equal then msg_max. - -/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the -maximum message size value (it is every message queue's attribute set during -its creation). - -/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the -default number of messages in a queue value if attr parameter of mq_open(2) is -NULL. If it exceed msg_max, the default value is initialized msg_max. - -/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting -the default message size value if attr parameter of mq_open(2) is NULL. If it -exceed msgsize_max, the default value is initialized msgsize_max. - -4. /proc/sys/fs/epoll - Configuration options for the epoll interface -===================================================================== - -This directory contains configuration options for the epoll(7) interface. - -max_user_watches ----------------- - -Every epoll file descriptor can store a number of files to be monitored -for event readiness. Each one of these monitored files constitutes a "watch". -This configuration option sets the maximum number of "watches" that are -allowed for each user. -Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes -on a 64bit one. -The current default value for max_user_watches is the 1/32 of the available -low memory, divided for the "watch" cost in bytes. diff --git a/Documentation/sysctl/index.rst b/Documentation/sysctl/index.rst deleted file mode 100644 index efbcde8c1c9c..000000000000 --- a/Documentation/sysctl/index.rst +++ /dev/null @@ -1,100 +0,0 @@ -:orphan: - -=========================== -Documentation for /proc/sys -=========================== - -Copyright (c) 1998, 1999, Rik van Riel - ------------------------------------------------------------------------------- - -'Why', I hear you ask, 'would anyone even _want_ documentation -for them sysctl files? If anybody really needs it, it's all in -the source...' - -Well, this documentation is written because some people either -don't know they need to tweak something, or because they don't -have the time or knowledge to read the source code. - -Furthermore, the programmers who built sysctl have built it to -be actually used, not just for the fun of programming it :-) - ------------------------------------------------------------------------------- - -Legal blurb: - -As usual, there are two main things to consider: - -1. you get what you pay for -2. it's free - -The consequences are that I won't guarantee the correctness of -this document, and if you come to me complaining about how you -screwed up your system because of wrong documentation, I won't -feel sorry for you. I might even laugh at you... - -But of course, if you _do_ manage to screw up your system using -only the sysctl options used in this file, I'd like to hear of -it. Not only to have a great laugh, but also to make sure that -you're the last RTFMing person to screw up. - -In short, e-mail your suggestions, corrections and / or horror -stories to: - -Rik van Riel. - --------------------------------------------------------------- - -Introduction -============ - -Sysctl is a means of configuring certain aspects of the kernel -at run-time, and the /proc/sys/ directory is there so that you -don't even need special tools to do it! -In fact, there are only four things needed to use these config -facilities: - -- a running Linux system -- root access -- common sense (this is especially hard to come by these days) -- knowledge of what all those values mean - -As a quick 'ls /proc/sys' will show, the directory consists of -several (arch-dependent?) subdirs. Each subdir is mainly about -one part of the kernel, so you can do configuration on a piece -by piece basis, or just some 'thematic frobbing'. - -This documentation is about: - -=============== =============================================================== -abi/ execution domains & personalities -debug/ -dev/ device specific information (eg dev/cdrom/info) -fs/ specific filesystems - filehandle, inode, dentry and quota tuning - binfmt_misc -kernel/ global kernel info / tuning - miscellaneous stuff -net/ networking stuff, for documentation look in: - -proc/ -sunrpc/ SUN Remote Procedure Call (NFS) -vm/ memory management tuning - buffer and cache management -user/ Per user per user namespace limits -=============== =============================================================== - -These are the subdirs I have on my system. There might be more -or other subdirs in another setup. If you see another dir, I'd -really like to hear about it :-) - -.. toctree:: - :maxdepth: 1 - - abi - fs - kernel - net - sunrpc - user - vm diff --git a/Documentation/sysctl/kernel.rst b/Documentation/sysctl/kernel.rst deleted file mode 100644 index a0c1d4ce403a..000000000000 --- a/Documentation/sysctl/kernel.rst +++ /dev/null @@ -1,1177 +0,0 @@ -=================================== -Documentation for /proc/sys/kernel/ -=================================== - -kernel version 2.2.10 - -Copyright (c) 1998, 1999, Rik van Riel - -Copyright (c) 2009, Shen Feng - -For general info and legal blurb, please look in index.rst. - ------------------------------------------------------------------------------- - -This file contains documentation for the sysctl files in -/proc/sys/kernel/ and is valid for Linux kernel version 2.2. - -The files in this directory can be used to tune and monitor -miscellaneous and general things in the operation of the Linux -kernel. Since some of the files _can_ be used to screw up your -system, it is advisable to read both documentation and source -before actually making adjustments. - -Currently, these files might (depending on your configuration) -show up in /proc/sys/kernel: - -- acct -- acpi_video_flags -- auto_msgmni -- bootloader_type [ X86 only ] -- bootloader_version [ X86 only ] -- cap_last_cap -- core_pattern -- core_pipe_limit -- core_uses_pid -- ctrl-alt-del -- dmesg_restrict -- domainname -- hostname -- hotplug -- hardlockup_all_cpu_backtrace -- hardlockup_panic -- hung_task_panic -- hung_task_check_count -- hung_task_timeout_secs -- hung_task_check_interval_secs -- hung_task_warnings -- hyperv_record_panic_msg -- kexec_load_disabled -- kptr_restrict -- l2cr [ PPC only ] -- modprobe ==> Documentation/debugging-modules.txt -- modules_disabled -- msg_next_id [ sysv ipc ] -- msgmax -- msgmnb -- msgmni -- nmi_watchdog -- osrelease -- ostype -- overflowgid -- overflowuid -- panic -- panic_on_oops -- panic_on_stackoverflow -- panic_on_unrecovered_nmi -- panic_on_warn -- panic_print -- panic_on_rcu_stall -- perf_cpu_time_max_percent -- perf_event_paranoid -- perf_event_max_stack -- perf_event_mlock_kb -- perf_event_max_contexts_per_stack -- pid_max -- powersave-nap [ PPC only ] -- printk -- printk_delay -- printk_ratelimit -- printk_ratelimit_burst -- pty ==> Documentation/filesystems/devpts.txt -- randomize_va_space -- real-root-dev ==> Documentation/admin-guide/initrd.rst -- reboot-cmd [ SPARC only ] -- rtsig-max -- rtsig-nr -- sched_energy_aware -- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst -- sem -- sem_next_id [ sysv ipc ] -- sg-big-buff [ generic SCSI device (sg) ] -- shm_next_id [ sysv ipc ] -- shm_rmid_forced -- shmall -- shmmax [ sysv ipc ] -- shmmni -- softlockup_all_cpu_backtrace -- soft_watchdog -- stack_erasing -- stop-a [ SPARC only ] -- sysrq ==> Documentation/admin-guide/sysrq.rst -- sysctl_writes_strict -- tainted ==> Documentation/admin-guide/tainted-kernels.rst -- threads-max -- unknown_nmi_panic -- watchdog -- watchdog_thresh -- version - - -acct: -===== - -highwater lowwater frequency - -If BSD-style process accounting is enabled these values control -its behaviour. If free space on filesystem where the log lives -goes below % accounting suspends. If free space gets -above % accounting resumes. determines -how often do we check the amount of free space (value is in -seconds). Default: -4 2 30 -That is, suspend accounting if there left <= 2% free; resume it -if we got >=4%; consider information about amount of free space -valid for 30 seconds. - - -acpi_video_flags: -================= - -flags - -See Doc*/kernel/power/video.txt, it allows mode of video boot to be -set during run time. - - -auto_msgmni: -============ - -This variable has no effect and may be removed in future kernel -releases. Reading it always returns 0. -Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni -upon memory add/remove or upon ipc namespace creation/removal. -Echoing "1" into this file enabled msgmni automatic recomputing. -Echoing "0" turned it off. auto_msgmni default value was 1. - - -bootloader_type: -================ - -x86 bootloader identification - -This gives the bootloader type number as indicated by the bootloader, -shifted left by 4, and OR'd with the low four bits of the bootloader -version. The reason for this encoding is that this used to match the -type_of_loader field in the kernel header; the encoding is kept for -backwards compatibility. That is, if the full bootloader type number -is 0x15 and the full version number is 0x234, this file will contain -the value 340 = 0x154. - -See the type_of_loader and ext_loader_type fields in -Documentation/x86/boot.rst for additional information. - - -bootloader_version: -=================== - -x86 bootloader version - -The complete bootloader version number. In the example above, this -file will contain the value 564 = 0x234. - -See the type_of_loader and ext_loader_ver fields in -Documentation/x86/boot.rst for additional information. - - -cap_last_cap: -============= - -Highest valid capability of the running kernel. Exports -CAP_LAST_CAP from the kernel. - - -core_pattern: -============= - -core_pattern is used to specify a core dumpfile pattern name. - -* max length 127 characters; default value is "core" -* core_pattern is used as a pattern template for the output filename; - certain string patterns (beginning with '%') are substituted with - their actual values. -* backward compatibility with core_uses_pid: - - If core_pattern does not include "%p" (default does not) - and core_uses_pid is set, then .PID will be appended to - the filename. - -* corename format specifiers:: - - % '%' is dropped - %% output one '%' - %p pid - %P global pid (init PID namespace) - %i tid - %I global tid (init PID namespace) - %u uid (in initial user namespace) - %g gid (in initial user namespace) - %d dump mode, matches PR_SET_DUMPABLE and - /proc/sys/fs/suid_dumpable - %s signal number - %t UNIX time of dump - %h hostname - %e executable filename (may be shortened) - %E executable path - % both are dropped - -* If the first character of the pattern is a '|', the kernel will treat - the rest of the pattern as a command to run. The core dump will be - written to the standard input of that program instead of to a file. - - -core_pipe_limit: -================ - -This sysctl is only applicable when core_pattern is configured to pipe -core files to a user space helper (when the first character of -core_pattern is a '|', see above). When collecting cores via a pipe -to an application, it is occasionally useful for the collecting -application to gather data about the crashing process from its -/proc/pid directory. In order to do this safely, the kernel must wait -for the collecting process to exit, so as not to remove the crashing -processes proc files prematurely. This in turn creates the -possibility that a misbehaving userspace collecting process can block -the reaping of a crashed process simply by never exiting. This sysctl -defends against that. It defines how many concurrent crashing -processes may be piped to user space applications in parallel. If -this value is exceeded, then those crashing processes above that value -are noted via the kernel log and their cores are skipped. 0 is a -special value, indicating that unlimited processes may be captured in -parallel, but that no waiting will take place (i.e. the collecting -process is not guaranteed access to /proc//). This -value defaults to 0. - - -core_uses_pid: -============== - -The default coredump filename is "core". By setting -core_uses_pid to 1, the coredump filename becomes core.PID. -If core_pattern does not include "%p" (default does not) -and core_uses_pid is set, then .PID will be appended to -the filename. - - -ctrl-alt-del: -============= - -When the value in this file is 0, ctrl-alt-del is trapped and -sent to the init(1) program to handle a graceful restart. -When, however, the value is > 0, Linux's reaction to a Vulcan -Nerve Pinch (tm) will be an immediate reboot, without even -syncing its dirty buffers. - -Note: - when a program (like dosemu) has the keyboard in 'raw' - mode, the ctrl-alt-del is intercepted by the program before it - ever reaches the kernel tty layer, and it's up to the program - to decide what to do with it. - - -dmesg_restrict: -=============== - -This toggle indicates whether unprivileged users are prevented -from using dmesg(8) to view messages from the kernel's log buffer. -When dmesg_restrict is set to (0) there are no restrictions. When -dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use -dmesg(8). - -The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the -default value of dmesg_restrict. - - -domainname & hostname: -====================== - -These files can be used to set the NIS/YP domainname and the -hostname of your box in exactly the same way as the commands -domainname and hostname, i.e.:: - - # echo "darkstar" > /proc/sys/kernel/hostname - # echo "mydomain" > /proc/sys/kernel/domainname - -has the same effect as:: - - # hostname "darkstar" - # domainname "mydomain" - -Note, however, that the classic darkstar.frop.org has the -hostname "darkstar" and DNS (Internet Domain Name Server) -domainname "frop.org", not to be confused with the NIS (Network -Information Service) or YP (Yellow Pages) domainname. These two -domain names are in general different. For a detailed discussion -see the hostname(1) man page. - - -hardlockup_all_cpu_backtrace: -============================= - -This value controls the hard lockup detector behavior when a hard -lockup condition is detected as to whether or not to gather further -debug information. If enabled, arch-specific all-CPU stack dumping -will be initiated. - -0: do nothing. This is the default behavior. - -1: on detection capture more debug information. - - -hardlockup_panic: -================= - -This parameter can be used to control whether the kernel panics -when a hard lockup is detected. - - 0 - don't panic on hard lockup - 1 - panic on hard lockup - -See Documentation/lockup-watchdogs.txt for more information. This can -also be set using the nmi_watchdog kernel parameter. - - -hotplug: -======== - -Path for the hotplug policy agent. -Default value is "/sbin/hotplug". - - -hung_task_panic: -================ - -Controls the kernel's behavior when a hung task is detected. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - -0: continue operation. This is the default behavior. - -1: panic immediately. - - -hung_task_check_count: -====================== - -The upper bound on the number of tasks that are checked. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - - -hung_task_timeout_secs: -======================= - -When a task in D state did not get scheduled -for more than this value report a warning. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - -0: means infinite timeout - no checking done. - -Possible values to set are in range {0..LONG_MAX/HZ}. - - -hung_task_check_interval_secs: -============================== - -Hung task check interval. If hung task checking is enabled -(see hung_task_timeout_secs), the check is done every -hung_task_check_interval_secs seconds. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - -0 (default): means use hung_task_timeout_secs as checking interval. -Possible values to set are in range {0..LONG_MAX/HZ}. - - -hung_task_warnings: -=================== - -The maximum number of warnings to report. During a check interval -if a hung task is detected, this value is decreased by 1. -When this value reaches 0, no more warnings will be reported. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - --1: report an infinite number of warnings. - - -hyperv_record_panic_msg: -======================== - -Controls whether the panic kmsg data should be reported to Hyper-V. - -0: do not report panic kmsg data. - -1: report the panic kmsg data. This is the default behavior. - - -kexec_load_disabled: -==================== - -A toggle indicating if the kexec_load syscall has been disabled. This -value defaults to 0 (false: kexec_load enabled), but can be set to 1 -(true: kexec_load disabled). Once true, kexec can no longer be used, and -the toggle cannot be set back to false. This allows a kexec image to be -loaded before disabling the syscall, allowing a system to set up (and -later use) an image without it being altered. Generally used together -with the "modules_disabled" sysctl. - - -kptr_restrict: -============== - -This toggle indicates whether restrictions are placed on -exposing kernel addresses via /proc and other interfaces. - -When kptr_restrict is set to 0 (the default) the address is hashed before -printing. (This is the equivalent to %p.) - -When kptr_restrict is set to (1), kernel pointers printed using the %pK -format specifier will be replaced with 0's unless the user has CAP_SYSLOG -and effective user and group ids are equal to the real ids. This is -because %pK checks are done at read() time rather than open() time, so -if permissions are elevated between the open() and the read() (e.g via -a setuid binary) then %pK will not leak kernel pointers to unprivileged -users. Note, this is a temporary solution only. The correct long-term -solution is to do the permission checks at open() time. Consider removing -world read permissions from files that use %pK, and using dmesg_restrict -to protect against uses of %pK in dmesg(8) if leaking kernel pointer -values to unprivileged users is a concern. - -When kptr_restrict is set to (2), kernel pointers printed using -%pK will be replaced with 0's regardless of privileges. - - -l2cr: (PPC only) -================ - -This flag controls the L2 cache of G3 processor boards. If -0, the cache is disabled. Enabled if nonzero. - - -modules_disabled: -================= - -A toggle value indicating if modules are allowed to be loaded -in an otherwise modular kernel. This toggle defaults to off -(0), but can be set true (1). Once true, modules can be -neither loaded nor unloaded, and the toggle cannot be set back -to false. Generally used with the "kexec_load_disabled" toggle. - - -msg_next_id, sem_next_id, and shm_next_id: -========================================== - -These three toggles allows to specify desired id for next allocated IPC -object: message, semaphore or shared memory respectively. - -By default they are equal to -1, which means generic allocation logic. -Possible values to set are in range {0..INT_MAX}. - -Notes: - 1) kernel doesn't guarantee, that new object will have desired id. So, - it's up to userspace, how to handle an object with "wrong" id. - 2) Toggle with non-default value will be set back to -1 by kernel after - successful IPC object allocation. If an IPC object allocation syscall - fails, it is undefined if the value remains unmodified or is reset to -1. - - -nmi_watchdog: -============= - -This parameter can be used to control the NMI watchdog -(i.e. the hard lockup detector) on x86 systems. - -0 - disable the hard lockup detector - -1 - enable the hard lockup detector - -The hard lockup detector monitors each CPU for its ability to respond to -timer interrupts. The mechanism utilizes CPU performance counter registers -that are programmed to generate Non-Maskable Interrupts (NMIs) periodically -while a CPU is busy. Hence, the alternative name 'NMI watchdog'. - -The NMI watchdog is disabled by default if the kernel is running as a guest -in a KVM virtual machine. This default can be overridden by adding:: - - nmi_watchdog=1 - -to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst). - - -numa_balancing: -=============== - -Enables/disables automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes -that access it often. - -Enables/disables automatic NUMA memory balancing. On NUMA machines, there -is a performance penalty if remote memory is accessed by a CPU. When this -feature is enabled the kernel samples what task thread is accessing memory -by periodically unmapping pages and later trapping a page fault. At the -time of the page fault, it is determined if the data being accessed should -be migrated to a local memory node. - -The unmapping of pages and trapping faults incur additional overhead that -ideally is offset by improved memory locality but there is no universal -guarantee. If the target workload is already bound to NUMA nodes then this -feature should be disabled. Otherwise, if the system overhead from the -feature is too high then the rate the kernel samples for NUMA hinting -faults may be controlled by the numa_balancing_scan_period_min_ms, -numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. - -numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb -=============================================================================================================================== - - -Automatic NUMA balancing scans tasks address space and unmaps pages to -detect if pages are properly placed or if the data should be migrated to a -memory node local to where the task is running. Every "scan delay" the task -scans the next "scan size" number of pages in its address space. When the -end of the address space is reached the scanner restarts from the beginning. - -In combination, the "scan delay" and "scan size" determine the scan rate. -When "scan delay" decreases, the scan rate increases. The scan delay and -hence the scan rate of every task is adaptive and depends on historical -behaviour. If pages are properly placed then the scan delay increases, -otherwise the scan delay decreases. The "scan size" is not adaptive but -the higher the "scan size", the higher the scan rate. - -Higher scan rates incur higher system overhead as page faults must be -trapped and potentially data must be migrated. However, the higher the scan -rate, the more quickly a tasks memory is migrated to a local node if the -workload pattern changes and minimises performance impact due to remote -memory accesses. These sysctls control the thresholds for scan delays and -the number of pages scanned. - -numa_balancing_scan_period_min_ms is the minimum time in milliseconds to -scan a tasks virtual memory. It effectively controls the maximum scanning -rate for each task. - -numa_balancing_scan_delay_ms is the starting "scan delay" used for a task -when it initially forks. - -numa_balancing_scan_period_max_ms is the maximum time in milliseconds to -scan a tasks virtual memory. It effectively controls the minimum scanning -rate for each task. - -numa_balancing_scan_size_mb is how many megabytes worth of pages are -scanned for a given scan. - - -osrelease, ostype & version: -============================ - -:: - - # cat osrelease - 2.1.88 - # cat ostype - Linux - # cat version - #5 Wed Feb 25 21:49:24 MET 1998 - -The files osrelease and ostype should be clear enough. Version -needs a little more clarification however. The '#5' means that -this is the fifth kernel built from this source base and the -date behind it indicates the time the kernel was built. -The only way to tune these values is to rebuild the kernel :-) - - -overflowgid & overflowuid: -========================== - -if your architecture did not always support 32-bit UIDs (i.e. arm, -i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to -applications that use the old 16-bit UID/GID system calls, if the -actual UID or GID would exceed 65535. - -These sysctls allow you to change the value of the fixed UID and GID. -The default is 65534. - - -panic: -====== - -The value in this file represents the number of seconds the kernel -waits before rebooting on a panic. When you use the software watchdog, -the recommended setting is 60. - - -panic_on_io_nmi: -================ - -Controls the kernel's behavior when a CPU receives an NMI caused by -an IO error. - -0: try to continue operation (default) - -1: panic immediately. The IO error triggered an NMI. This indicates a - serious system condition which could result in IO data corruption. - Rather than continuing, panicking might be a better choice. Some - servers issue this sort of NMI when the dump button is pushed, - and you can use this option to take a crash dump. - - -panic_on_oops: -============== - -Controls the kernel's behaviour when an oops or BUG is encountered. - -0: try to continue operation - -1: panic immediately. If the `panic` sysctl is also non-zero then the - machine will be rebooted. - - -panic_on_stackoverflow: -======================= - -Controls the kernel's behavior when detecting the overflows of -kernel, IRQ and exception stacks except a user stack. -This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. - -0: try to continue operation. - -1: panic immediately. - - -panic_on_unrecovered_nmi: -========================= - -The default Linux behaviour on an NMI of either memory or unknown is -to continue operation. For many environments such as scientific -computing it is preferable that the box is taken out and the error -dealt with than an uncorrected parity/ECC error get propagated. - -A small number of systems do generate NMI's for bizarre random reasons -such as power management so the default is off. That sysctl works like -the existing panic controls already in that directory. - - -panic_on_warn: -============== - -Calls panic() in the WARN() path when set to 1. This is useful to avoid -a kernel rebuild when attempting to kdump at the location of a WARN(). - -0: only WARN(), default behaviour. - -1: call panic() after printing out WARN() location. - - -panic_print: -============ - -Bitmask for printing system info when panic happens. User can chose -combination of the following bits: - -===== ======================================== -bit 0 print all tasks info -bit 1 print system memory info -bit 2 print timer info -bit 3 print locks info if CONFIG_LOCKDEP is on -bit 4 print ftrace buffer -===== ======================================== - -So for example to print tasks and memory info on panic, user can:: - - echo 3 > /proc/sys/kernel/panic_print - - -panic_on_rcu_stall: -=================== - -When set to 1, calls panic() after RCU stall detection messages. This -is useful to define the root cause of RCU stalls using a vmcore. - -0: do not panic() when RCU stall takes place, default behavior. - -1: panic() after printing RCU stall messages. - - -perf_cpu_time_max_percent: -========================== - -Hints to the kernel how much CPU time it should be allowed to -use to handle perf sampling events. If the perf subsystem -is informed that its samples are exceeding this limit, it -will drop its sampling frequency to attempt to reduce its CPU -usage. - -Some perf sampling happens in NMIs. If these samples -unexpectedly take too long to execute, the NMIs can become -stacked up next to each other so much that nothing else is -allowed to execute. - -0: - disable the mechanism. Do not monitor or correct perf's - sampling rate no matter how CPU time it takes. - -1-100: - attempt to throttle perf's sample rate to this - percentage of CPU. Note: the kernel calculates an - "expected" length of each sample event. 100 here means - 100% of that expected length. Even if this is set to - 100, you may still see sample throttling if this - length is exceeded. Set to 0 if you truly do not care - how much CPU is consumed. - - -perf_event_paranoid: -==================== - -Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 2. - -=== ================================================================== - -1 Allow use of (almost) all events by all users - - Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK - ->=0 Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN - - Disallow raw tracepoint access by users without CAP_SYS_ADMIN - ->=1 Disallow CPU event access by users without CAP_SYS_ADMIN - ->=2 Disallow kernel profiling by users without CAP_SYS_ADMIN -=== ================================================================== - - -perf_event_max_stack: -===================== - -Controls maximum number of stack frames to copy for (attr.sample_type & -PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using -'perf record -g' or 'perf trace --call-graph fp'. - -This can only be done when no events are in use that have callchains -enabled, otherwise writing to this file will return -EBUSY. - -The default value is 127. - - -perf_event_mlock_kb: -==================== - -Control size of per-cpu ring buffer not counted agains mlock limit. - -The default value is 512 + 1 page - - -perf_event_max_contexts_per_stack: -================================== - -Controls maximum number of stack frame context entries for -(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for -instance, when using 'perf record -g' or 'perf trace --call-graph fp'. - -This can only be done when no events are in use that have callchains -enabled, otherwise writing to this file will return -EBUSY. - -The default value is 8. - - -pid_max: -======== - -PID allocation wrap value. When the kernel's next PID value -reaches this value, it wraps back to a minimum PID value. -PIDs of value pid_max or larger are not allocated. - - -ns_last_pid: -============ - -The last pid allocated in the current (the one task using this sysctl -lives in) pid namespace. When selecting a pid for a next task on fork -kernel tries to allocate a number starting from this one. - - -powersave-nap: (PPC only) -========================= - -If set, Linux-PPC will use the 'nap' mode of powersaving, -otherwise the 'doze' mode will be used. - -============================================================== - -printk: -======= - -The four values in printk denote: console_loglevel, -default_message_loglevel, minimum_console_loglevel and -default_console_loglevel respectively. - -These values influence printk() behavior when printing or -logging error messages. See 'man 2 syslog' for more info on -the different loglevels. - -- console_loglevel: - messages with a higher priority than - this will be printed to the console -- default_message_loglevel: - messages without an explicit priority - will be printed with this priority -- minimum_console_loglevel: - minimum (highest) value to which - console_loglevel can be set -- default_console_loglevel: - default value for console_loglevel - - -printk_delay: -============= - -Delay each printk message in printk_delay milliseconds - -Value from 0 - 10000 is allowed. - - -printk_ratelimit: -================= - -Some warning messages are rate limited. printk_ratelimit specifies -the minimum length of time between these messages (in jiffies), by -default we allow one every 5 seconds. - -A value of 0 will disable rate limiting. - - -printk_ratelimit_burst: -======================= - -While long term we enforce one message per printk_ratelimit -seconds, we do allow a burst of messages to pass through. -printk_ratelimit_burst specifies the number of messages we can -send before ratelimiting kicks in. - - -printk_devkmsg: -=============== - -Control the logging to /dev/kmsg from userspace: - -ratelimit: - default, ratelimited - -on: unlimited logging to /dev/kmsg from userspace - -off: logging to /dev/kmsg disabled - -The kernel command line parameter printk.devkmsg= overrides this and is -a one-time setting until next reboot: once set, it cannot be changed by -this sysctl interface anymore. - - -randomize_va_space: -=================== - -This option can be used to select the type of process address -space randomization that is used in the system, for architectures -that support this feature. - -== =========================================================================== -0 Turn the process address space randomization off. This is the - default for architectures that do not support this feature anyways, - and kernels that are booted with the "norandmaps" parameter. - -1 Make the addresses of mmap base, stack and VDSO page randomized. - This, among other things, implies that shared libraries will be - loaded to random addresses. Also for PIE-linked binaries, the - location of code start is randomized. This is the default if the - CONFIG_COMPAT_BRK option is enabled. - -2 Additionally enable heap randomization. This is the default if - CONFIG_COMPAT_BRK is disabled. - - There are a few legacy applications out there (such as some ancient - versions of libc.so.5 from 1996) that assume that brk area starts - just after the end of the code+bss. These applications break when - start of the brk area is randomized. There are however no known - non-legacy applications that would be broken this way, so for most - systems it is safe to choose full randomization. - - Systems with ancient and/or broken binaries should be configured - with CONFIG_COMPAT_BRK enabled, which excludes the heap from process - address space randomization. -== =========================================================================== - - -reboot-cmd: (Sparc only) -======================== - -??? This seems to be a way to give an argument to the Sparc -ROM/Flash boot loader. Maybe to tell it what to do after -rebooting. ??? - - -rtsig-max & rtsig-nr: -===================== - -The file rtsig-max can be used to tune the maximum number -of POSIX realtime (queued) signals that can be outstanding -in the system. - -rtsig-nr shows the number of RT signals currently queued. - - -sched_energy_aware: -=================== - -Enables/disables Energy Aware Scheduling (EAS). EAS starts -automatically on platforms where it can run (that is, -platforms with asymmetric CPU topologies and having an Energy -Model available). If your platform happens to meet the -requirements for EAS but you do not want to use it, change -this value to 0. - - -sched_schedstats: -================= - -Enables/disables scheduler statistics. Enabling this feature -incurs a small amount of overhead in the scheduler but is -useful for debugging and performance tuning. - - -sg-big-buff: -============ - -This file shows the size of the generic SCSI (sg) buffer. -You can't tune it just yet, but you could change it on -compile time by editing include/scsi/sg.h and changing -the value of SG_BIG_BUFF. - -There shouldn't be any reason to change this value. If -you can come up with one, you probably know what you -are doing anyway :) - - -shmall: -======= - -This parameter sets the total amount of shared memory pages that -can be used system wide. Hence, SHMALL should always be at least -ceil(shmmax/PAGE_SIZE). - -If you are not sure what the default PAGE_SIZE is on your Linux -system, you can run the following command: - - # getconf PAGE_SIZE - - -shmmax: -======= - -This value can be used to query and set the run time limit -on the maximum shared memory segment size that can be created. -Shared memory segments up to 1Gb are now supported in the -kernel. This value defaults to SHMMAX. - - -shm_rmid_forced: -================ - -Linux lets you set resource limits, including how much memory one -process can consume, via setrlimit(2). Unfortunately, shared memory -segments are allowed to exist without association with any process, and -thus might not be counted against any resource limits. If enabled, -shared memory segments are automatically destroyed when their attach -count becomes zero after a detach or a process termination. It will -also destroy segments that were created, but never attached to, on exit -from the process. The only use left for IPC_RMID is to immediately -destroy an unattached segment. Of course, this breaks the way things are -defined, so some applications might stop working. Note that this -feature will do you no good unless you also configure your resource -limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't -need this. - -Note that if you change this from 0 to 1, already created segments -without users and with a dead originative process will be destroyed. - - -sysctl_writes_strict: -===================== - -Control how file position affects the behavior of updating sysctl values -via the /proc/sys interface: - - == ====================================================================== - -1 Legacy per-write sysctl value handling, with no printk warnings. - Each write syscall must fully contain the sysctl value to be - written, and multiple writes on the same sysctl file descriptor - will rewrite the sysctl value, regardless of file position. - 0 Same behavior as above, but warn about processes that perform writes - to a sysctl file descriptor when the file position is not 0. - 1 (default) Respect file position when writing sysctl strings. Multiple - writes will append to the sysctl value buffer. Anything past the max - length of the sysctl value buffer will be ignored. Writes to numeric - sysctl entries must always be at file position 0 and the value must - be fully contained in the buffer sent in the write syscall. - == ====================================================================== - - -softlockup_all_cpu_backtrace: -============================= - -This value controls the soft lockup detector thread's behavior -when a soft lockup condition is detected as to whether or not -to gather further debug information. If enabled, each cpu will -be issued an NMI and instructed to capture stack trace. - -This feature is only applicable for architectures which support -NMI. - -0: do nothing. This is the default behavior. - -1: on detection capture more debug information. - - -soft_watchdog: -============== - -This parameter can be used to control the soft lockup detector. - - 0 - disable the soft lockup detector - - 1 - enable the soft lockup detector - -The soft lockup detector monitors CPUs for threads that are hogging the CPUs -without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads -from running. The mechanism depends on the CPUs ability to respond to timer -interrupts which are needed for the 'watchdog/N' threads to be woken up by -the watchdog timer function, otherwise the NMI watchdog - if enabled - can -detect a hard lockup condition. - - -stack_erasing: -============== - -This parameter can be used to control kernel stack erasing at the end -of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. - -That erasing reduces the information which kernel stack leak bugs -can reveal and blocks some uninitialized stack variable attacks. -The tradeoff is the performance impact: on a single CPU system kernel -compilation sees a 1% slowdown, other systems and workloads may vary. - - 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. - - 1: kernel stack erasing is enabled (default), it is performed before - returning to the userspace at the end of syscalls. - - -tainted -======= - -Non-zero if the kernel has been tainted. Numeric values, which can be -ORed together. The letters are seen in "Tainted" line of Oops reports. - -====== ===== ============================================================== - 1 `(P)` proprietary module was loaded - 2 `(F)` module was force loaded - 4 `(S)` SMP kernel oops on an officially SMP incapable processor - 8 `(R)` module was force unloaded - 16 `(M)` processor reported a Machine Check Exception (MCE) - 32 `(B)` bad page referenced or some unexpected page flags - 64 `(U)` taint requested by userspace application - 128 `(D)` kernel died recently, i.e. there was an OOPS or BUG - 256 `(A)` an ACPI table was overridden by user - 512 `(W)` kernel issued warning - 1024 `(C)` staging driver was loaded - 2048 `(I)` workaround for bug in platform firmware applied - 4096 `(O)` externally-built ("out-of-tree") module was loaded - 8192 `(E)` unsigned module was loaded - 16384 `(L)` soft lockup occurred - 32768 `(K)` kernel has been live patched - 65536 `(X)` Auxiliary taint, defined and used by for distros -131072 `(T)` The kernel was built with the struct randomization plugin -====== ===== ============================================================== - -See Documentation/admin-guide/tainted-kernels.rst for more information. - - -threads-max: -============ - -This value controls the maximum number of threads that can be created -using fork(). - -During initialization the kernel sets this value such that even if the -maximum number of threads is created, the thread structures occupy only -a part (1/8th) of the available RAM pages. - -The minimum value that can be written to threads-max is 20. - -The maximum value that can be written to threads-max is given by the -constant FUTEX_TID_MASK (0x3fffffff). - -If a value outside of this range is written to threads-max an error -EINVAL occurs. - -The value written is checked against the available RAM pages. If the -thread structures would occupy too much (more than 1/8th) of the -available RAM pages threads-max is reduced accordingly. - - -unknown_nmi_panic: -================== - -The value in this file affects behavior of handling NMI. When the -value is non-zero, unknown NMI is trapped and then panic occurs. At -that time, kernel debugging information is displayed on console. - -NMI switch that most IA32 servers have fires unknown NMI up, for -example. If a system hangs up, try pressing the NMI switch. - - -watchdog: -========= - -This parameter can be used to disable or enable the soft lockup detector -_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time. - - 0 - disable both lockup detectors - - 1 - enable both lockup detectors - -The soft lockup detector and the NMI watchdog can also be disabled or -enabled individually, using the soft_watchdog and nmi_watchdog parameters. -If the watchdog parameter is read, for example by executing:: - - cat /proc/sys/kernel/watchdog - -the output of this command (0 or 1) shows the logical OR of soft_watchdog -and nmi_watchdog. - - -watchdog_cpumask: -================= - -This value can be used to control on which cpus the watchdog may run. -The default cpumask is all possible cores, but if NO_HZ_FULL is -enabled in the kernel config, and cores are specified with the -nohz_full= boot argument, those cores are excluded by default. -Offline cores can be included in this mask, and if the core is later -brought online, the watchdog will be started based on the mask value. - -Typically this value would only be touched in the nohz_full case -to re-enable cores that by default were not running the watchdog, -if a kernel lockup was suspected on those cores. - -The argument value is the standard cpulist format for cpumasks, -so for example to enable the watchdog on cores 0, 2, 3, and 4 you -might say:: - - echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask - - -watchdog_thresh: -================ - -This value can be used to control the frequency of hrtimer and NMI -events and the soft and hard lockup thresholds. The default threshold -is 10 seconds. - -The softlockup threshold is (2 * watchdog_thresh). Setting this -tunable to zero will disable lockup detection altogether. diff --git a/Documentation/sysctl/net.rst b/Documentation/sysctl/net.rst deleted file mode 100644 index a7d44e71019d..000000000000 --- a/Documentation/sysctl/net.rst +++ /dev/null @@ -1,461 +0,0 @@ -================================ -Documentation for /proc/sys/net/ -================================ - -Copyright - -Copyright (c) 1999 - - - Terrehon Bowden - - Bodo Bauer - -Copyright (c) 2000 - - - Jorge Nerin - -Copyright (c) 2009 - - - Shen Feng - -For general info and legal blurb, please look in index.rst. - ------------------------------------------------------------------------------- - -This file contains the documentation for the sysctl files in -/proc/sys/net - -The interface to the networking parts of the kernel is located in -/proc/sys/net. The following table shows all possible subdirectories. You may -see only some of them, depending on your kernel's configuration. - - -Table : Subdirectories in /proc/sys/net - - ========= =================== = ========== ================== - Directory Content Directory Content - ========= =================== = ========== ================== - core General parameter appletalk Appletalk protocol - unix Unix domain sockets netrom NET/ROM - 802 E802 protocol ax25 AX25 - ethernet Ethernet protocol rose X.25 PLP layer - ipv4 IP version 4 x25 X.25 protocol - ipx IPX token-ring IBM token ring - bridge Bridging decnet DEC net - ipv6 IP version 6 tipc TIPC - ========= =================== = ========== ================== - -1. /proc/sys/net/core - Network core options -============================================ - -bpf_jit_enable --------------- - -This enables the BPF Just in Time (JIT) compiler. BPF is a flexible -and efficient infrastructure allowing to execute bytecode at various -hook points. It is used in a number of Linux kernel subsystems such -as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints) -and security (e.g. seccomp). LLVM has a BPF back end that can compile -restricted C into a sequence of BPF instructions. After program load -through bpf(2) and passing a verifier in the kernel, a JIT will then -translate these BPF proglets into native CPU instructions. There are -two flavors of JITs, the newer eBPF JIT currently supported on: - - - x86_64 - - x86_32 - - arm64 - - arm32 - - ppc64 - - sparc64 - - mips64 - - s390x - - riscv - -And the older cBPF JIT supported on the following archs: - - - mips - - ppc - - sparc - -eBPF JITs are a superset of cBPF JITs, meaning the kernel will -migrate cBPF instructions into eBPF instructions and then JIT -compile them transparently. Older cBPF JITs can only translate -tcpdump filters, seccomp rules, etc, but not mentioned eBPF -programs loaded through bpf(2). - -Values: - - - 0 - disable the JIT (default value) - - 1 - enable the JIT - - 2 - enable the JIT and ask the compiler to emit traces on kernel log. - -bpf_jit_harden --------------- - -This enables hardening for the BPF JIT compiler. Supported are eBPF -JIT backends. Enabling hardening trades off performance, but can -mitigate JIT spraying. - -Values: - - - 0 - disable JIT hardening (default value) - - 1 - enable JIT hardening for unprivileged users only - - 2 - enable JIT hardening for all users - -bpf_jit_kallsyms ----------------- - -When BPF JIT compiler is enabled, then compiled images are unknown -addresses to the kernel, meaning they neither show up in traces nor -in /proc/kallsyms. This enables export of these addresses, which can -be used for debugging/tracing. If bpf_jit_harden is enabled, this -feature is disabled. - -Values : - - - 0 - disable JIT kallsyms export (default value) - - 1 - enable JIT kallsyms export for privileged users only - -bpf_jit_limit -------------- - -This enforces a global limit for memory allocations to the BPF JIT -compiler in order to reject unprivileged JIT requests once it has -been surpassed. bpf_jit_limit contains the value of the global limit -in bytes. - -dev_weight ----------- - -The maximum number of packets that kernel can handle on a NAPI interrupt, -it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware -aggregated packet is counted as one packet in this context. - -Default: 64 - -dev_weight_rx_bias ------------------- - -RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function -of the driver for the per softirq cycle netdev_budget. This parameter influences -the proportion of the configured netdev_budget that is spent on RPS based packet -processing during RX softirq cycles. It is further meant for making current -dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack. -(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based -on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias). - -Default: 1 - -dev_weight_tx_bias ------------------- - -Scales the maximum number of packets that can be processed during a TX softirq cycle. -Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric -net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog. - -Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias). - -Default: 1 - -default_qdisc -------------- - -The default queuing discipline to use for network devices. This allows -overriding the default of pfifo_fast with an alternative. Since the default -queuing discipline is created without additional parameters so is best suited -to queuing disciplines that work well without configuration like stochastic -fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use -queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin -which require setting up classes and bandwidths. Note that physical multiqueue -interfaces still use mq as root qdisc, which in turn uses this default for its -leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead -default to noqueue. - -Default: pfifo_fast - -busy_read ---------- - -Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL) -Approximate time in us to busy loop waiting for packets on the device queue. -This sets the default value of the SO_BUSY_POLL socket option. -Can be set or overridden per socket by setting socket option SO_BUSY_POLL, -which is the preferred method of enabling. If you need to enable the feature -globally via sysctl, a value of 50 is recommended. - -Will increase power usage. - -Default: 0 (off) - -busy_poll ----------------- -Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL) -Approximate time in us to busy loop waiting for events. -Recommended value depends on the number of sockets you poll on. -For several sockets 50, for several hundreds 100. -For more than that you probably want to use epoll. -Note that only sockets with SO_BUSY_POLL set will be busy polled, -so you want to either selectively set SO_BUSY_POLL on those sockets or set -sysctl.net.busy_read globally. - -Will increase power usage. - -Default: 0 (off) - -rmem_default ------------- - -The default setting of the socket receive buffer in bytes. - -rmem_max --------- - -The maximum receive socket buffer size in bytes. - -tstamp_allow_data ------------------ -Allow processes to receive tx timestamps looped together with the original -packet contents. If disabled, transmit timestamp requests from unprivileged -processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set. - -Default: 1 (on) - - -wmem_default ------------- - -The default setting (in bytes) of the socket send buffer. - -wmem_max --------- - -The maximum send socket buffer size in bytes. - -message_burst and message_cost ------------------------------- - -These parameters are used to limit the warning messages written to the kernel -log from the networking code. They enforce a rate limit to make a -denial-of-service attack impossible. A higher message_cost factor, results in -fewer messages that will be written. Message_burst controls when messages will -be dropped. The default settings limit warning messages to one every five -seconds. - -warnings --------- - -This sysctl is now unused. - -This was used to control console messages from the networking stack that -occur because of problems on the network like duplicate address or bad -checksums. - -These messages are now emitted at KERN_DEBUG and can generally be enabled -and controlled by the dynamic_debug facility. - -netdev_budget -------------- - -Maximum number of packets taken from all interfaces in one polling cycle (NAPI -poll). In one polling cycle interfaces which are registered to polling are -probed in a round-robin manner. Also, a polling cycle may not exceed -netdev_budget_usecs microseconds, even if netdev_budget has not been -exhausted. - -netdev_budget_usecs ---------------------- - -Maximum number of microseconds in one NAPI polling cycle. Polling -will exit when either netdev_budget_usecs have elapsed during the -poll cycle or the number of packets processed reaches netdev_budget. - -netdev_max_backlog ------------------- - -Maximum number of packets, queued on the INPUT side, when the interface -receives packets faster than kernel can process them. - -netdev_rss_key --------------- - -RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is -randomly generated. -Some user space might need to gather its content even if drivers do not -provide ethtool -x support yet. - -:: - - myhost:~# cat /proc/sys/net/core/netdev_rss_key - 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total) - -File contains nul bytes if no driver ever called netdev_rss_key_fill() function. - -Note: - /proc/sys/net/core/netdev_rss_key contains 52 bytes of key, - but most drivers only use 40 bytes of it. - -:: - - myhost:~# ethtool -x eth0 - RX flow hash indirection table for eth0 with 8 RX ring(s): - 0: 0 1 2 3 4 5 6 7 - RSS hash key: - 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89 - -netdev_tstamp_prequeue ----------------------- - -If set to 0, RX packet timestamps can be sampled after RPS processing, when -the target CPU processes packets. It might give some delay on timestamps, but -permit to distribute the load on several cpus. - -If set to 1 (default), timestamps are sampled as soon as possible, before -queueing. - -optmem_max ----------- - -Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence -of struct cmsghdr structures with appended data. - -fb_tunnels_only_for_init_net ----------------------------- - -Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0, -sit0, ip6tnl0, ip6gre0) are automatically created when a new -network namespace is created, if corresponding tunnel is present -in initial network namespace. -If set to 1, these devices are not automatically created, and -user space is responsible for creating them if needed. - -Default : 0 (for compatibility reasons) - -devconf_inherit_init_net ------------------------- - -Controls if a new network namespace should inherit all current -settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By -default, we keep the current behavior: for IPv4 we inherit all current -settings from init_net and for IPv6 we reset all settings to default. - -If set to 1, both IPv4 and IPv6 settings are forced to inherit from -current ones in init_net. If set to 2, both IPv4 and IPv6 settings are -forced to reset to their default values. - -Default : 0 (for compatibility reasons) - -2. /proc/sys/net/unix - Parameters for Unix domain sockets ----------------------------------------------------------- - -There is only one file in this directory. -unix_dgram_qlen limits the max number of datagrams queued in Unix domain -socket's buffer. It will not take effect unless PF_UNIX flag is specified. - - -3. /proc/sys/net/ipv4 - IPV4 settings -------------------------------------- -Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for -descriptions of these entries. - - -4. Appletalk ------------- - -The /proc/sys/net/appletalk directory holds the Appletalk configuration data -when Appletalk is loaded. The configurable parameters are: - -aarp-expiry-time ----------------- - -The amount of time we keep an ARP entry before expiring it. Used to age out -old hosts. - -aarp-resolve-time ------------------ - -The amount of time we will spend trying to resolve an Appletalk address. - -aarp-retransmit-limit ---------------------- - -The number of times we will retransmit a query before giving up. - -aarp-tick-time --------------- - -Controls the rate at which expires are checked. - -The directory /proc/net/appletalk holds the list of active Appletalk sockets -on a machine. - -The fields indicate the DDP type, the local address (in network:node format) -the remote address, the size of the transmit pending queue, the size of the -received queue (bytes waiting for applications to read) the state and the uid -owning the socket. - -/proc/net/atalk_iface lists all the interfaces configured for appletalk.It -shows the name of the interface, its Appletalk address, the network range on -that address (or network number for phase 1 networks), and the status of the -interface. - -/proc/net/atalk_route lists each known network route. It lists the target -(network) that the route leads to, the router (may be directly connected), the -route flags, and the device the route is using. - - -5. IPX ------- - -The IPX protocol has no tunable values in proc/sys/net. - -The IPX protocol does, however, provide proc/net/ipx. This lists each IPX -socket giving the local and remote addresses in Novell format (that is -network:node:port). In accordance with the strange Novell tradition, -everything but the port is in hex. Not_Connected is displayed for sockets that -are not tied to a specific remote address. The Tx and Rx queue sizes indicate -the number of bytes pending for transmission and reception. The state -indicates the state the socket is in and the uid is the owning uid of the -socket. - -The /proc/net/ipx_interface file lists all IPX interfaces. For each interface -it gives the network number, the node number, and indicates if the network is -the primary network. It also indicates which device it is bound to (or -Internal for internal networks) and the Frame Type if appropriate. Linux -supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for -IPX. - -The /proc/net/ipx_route table holds a list of IPX routes. For each route it -gives the destination network, the router node (or Directly) and the network -address of the router (or Connected) for internal networks. - -6. TIPC -------- - -tipc_rmem ---------- - -The TIPC protocol now has a tunable for the receive memory, similar to the -tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max) - -:: - - # cat /proc/sys/net/tipc/tipc_rmem - 4252725 34021800 68043600 - # - -The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values -are scaled (shifted) versions of that same value. Note that the min value -is not at this point in time used in any meaningful way, but the triplet is -preserved in order to be consistent with things like tcp_rmem. - -named_timeout -------------- - -TIPC name table updates are distributed asynchronously in a cluster, without -any form of transaction handling. This means that different race scenarios are -possible. One such is that a name withdrawal sent out by one node and received -by another node may arrive after a second, overlapping name publication already -has been accepted from a third node, although the conflicting updates -originally may have been issued in the correct sequential order. -If named_timeout is nonzero, failed topology updates will be placed on a defer -queue until another event arrives that clears the error, or until the timeout -expires. Value is in milliseconds. diff --git a/Documentation/sysctl/sunrpc.rst b/Documentation/sysctl/sunrpc.rst deleted file mode 100644 index 09780a682afd..000000000000 --- a/Documentation/sysctl/sunrpc.rst +++ /dev/null @@ -1,25 +0,0 @@ -=================================== -Documentation for /proc/sys/sunrpc/ -=================================== - -kernel version 2.2.10 - -Copyright (c) 1998, 1999, Rik van Riel - -For general info and legal blurb, please look in index.rst. - ------------------------------------------------------------------------------- - -This file contains the documentation for the sysctl files in -/proc/sys/sunrpc and is valid for Linux kernel version 2.2. - -The files in this directory can be used to (re)set the debug -flags of the SUN Remote Procedure Call (RPC) subsystem in -the Linux kernel. This stuff is used for NFS, KNFSD and -maybe a few other things as well. - -The files in there are used to control the debugging flags: -rpc_debug, nfs_debug, nfsd_debug and nlm_debug. - -These flags are for kernel hackers only. You should read the -source code in net/sunrpc/ for more information. diff --git a/Documentation/sysctl/user.rst b/Documentation/sysctl/user.rst deleted file mode 100644 index 650eaa03f15e..000000000000 --- a/Documentation/sysctl/user.rst +++ /dev/null @@ -1,78 +0,0 @@ -================================= -Documentation for /proc/sys/user/ -================================= - -kernel version 4.9.0 - -Copyright (c) 2016 Eric Biederman - ------------------------------------------------------------------------------- - -This file contains the documentation for the sysctl files in -/proc/sys/user. - -The files in this directory can be used to override the default -limits on the number of namespaces and other objects that have -per user per user namespace limits. - -The primary purpose of these limits is to stop programs that -malfunction and attempt to create a ridiculous number of objects, -before the malfunction becomes a system wide problem. It is the -intention that the defaults of these limits are set high enough that -no program in normal operation should run into these limits. - -The creation of per user per user namespace objects are charged to -the user in the user namespace who created the object and -verified to be below the per user limit in that user namespace. - -The creation of objects is also charged to all of the users -who created user namespaces the creation of the object happens -in (user namespaces can be nested) and verified to be below the per user -limits in the user namespaces of those users. - -This recursive counting of created objects ensures that creating a -user namespace does not allow a user to escape their current limits. - -Currently, these files are in /proc/sys/user: - -max_cgroup_namespaces -===================== - - The maximum number of cgroup namespaces that any user in the current - user namespace may create. - -max_ipc_namespaces -================== - - The maximum number of ipc namespaces that any user in the current - user namespace may create. - -max_mnt_namespaces -================== - - The maximum number of mount namespaces that any user in the current - user namespace may create. - -max_net_namespaces -================== - - The maximum number of network namespaces that any user in the - current user namespace may create. - -max_pid_namespaces -================== - - The maximum number of pid namespaces that any user in the current - user namespace may create. - -max_user_namespaces -=================== - - The maximum number of user namespaces that any user in the current - user namespace may create. - -max_uts_namespaces -================== - - The maximum number of user namespaces that any user in the current - user namespace may create. diff --git a/Documentation/sysctl/vm.rst b/Documentation/sysctl/vm.rst deleted file mode 100644 index 5aceb5cd5ce7..000000000000 --- a/Documentation/sysctl/vm.rst +++ /dev/null @@ -1,964 +0,0 @@ -=============================== -Documentation for /proc/sys/vm/ -=============================== - -kernel version 2.6.29 - -Copyright (c) 1998, 1999, Rik van Riel - -Copyright (c) 2008 Peter W. Morreale - -For general info and legal blurb, please look in index.rst. - ------------------------------------------------------------------------------- - -This file contains the documentation for the sysctl files in -/proc/sys/vm and is valid for Linux kernel version 2.6.29. - -The files in this directory can be used to tune the operation -of the virtual memory (VM) subsystem of the Linux kernel and -the writeout of dirty data to disk. - -Default values and initialization routines for most of these -files can be found in mm/swap.c. - -Currently, these files are in /proc/sys/vm: - -- admin_reserve_kbytes -- block_dump -- compact_memory -- compact_unevictable_allowed -- dirty_background_bytes -- dirty_background_ratio -- dirty_bytes -- dirty_expire_centisecs -- dirty_ratio -- dirtytime_expire_seconds -- dirty_writeback_centisecs -- drop_caches -- extfrag_threshold -- hugetlb_shm_group -- laptop_mode -- legacy_va_layout -- lowmem_reserve_ratio -- max_map_count -- memory_failure_early_kill -- memory_failure_recovery -- min_free_kbytes -- min_slab_ratio -- min_unmapped_ratio -- mmap_min_addr -- mmap_rnd_bits -- mmap_rnd_compat_bits -- nr_hugepages -- nr_hugepages_mempolicy -- nr_overcommit_hugepages -- nr_trim_pages (only if CONFIG_MMU=n) -- numa_zonelist_order -- oom_dump_tasks -- oom_kill_allocating_task -- overcommit_kbytes -- overcommit_memory -- overcommit_ratio -- page-cluster -- panic_on_oom -- percpu_pagelist_fraction -- stat_interval -- stat_refresh -- numa_stat -- swappiness -- unprivileged_userfaultfd -- user_reserve_kbytes -- vfs_cache_pressure -- watermark_boost_factor -- watermark_scale_factor -- zone_reclaim_mode - - -admin_reserve_kbytes -==================== - -The amount of free memory in the system that should be reserved for users -with the capability cap_sys_admin. - -admin_reserve_kbytes defaults to min(3% of free pages, 8MB) - -That should provide enough for the admin to log in and kill a process, -if necessary, under the default overcommit 'guess' mode. - -Systems running under overcommit 'never' should increase this to account -for the full Virtual Memory Size of programs used to recover. Otherwise, -root may not be able to log in to recover the system. - -How do you calculate a minimum useful reserve? - -sshd or login + bash (or some other shell) + top (or ps, kill, etc.) - -For overcommit 'guess', we can sum resident set sizes (RSS). -On x86_64 this is about 8MB. - -For overcommit 'never', we can take the max of their virtual sizes (VSZ) -and add the sum of their RSS. -On x86_64 this is about 128MB. - -Changing this takes effect whenever an application requests memory. - - -block_dump -========== - -block_dump enables block I/O debugging when set to a nonzero value. More -information on block I/O debugging is in Documentation/laptops/laptop-mode.rst. - - -compact_memory -============== - -Available only when CONFIG_COMPACTION is set. When 1 is written to the file, -all zones are compacted such that free memory is available in contiguous -blocks where possible. This can be important for example in the allocation of -huge pages although processes will also directly compact memory as required. - - -compact_unevictable_allowed -=========================== - -Available only when CONFIG_COMPACTION is set. When set to 1, compaction is -allowed to examine the unevictable lru (mlocked pages) for pages to compact. -This should be used on systems where stalls for minor page faults are an -acceptable trade for large contiguous free memory. Set to 0 to prevent -compaction from moving pages that are unevictable. Default value is 1. - - -dirty_background_bytes -====================== - -Contains the amount of dirty memory at which the background kernel -flusher threads will start writeback. - -Note: - dirty_background_bytes is the counterpart of dirty_background_ratio. Only - one of them may be specified at a time. When one sysctl is written it is - immediately taken into account to evaluate the dirty memory limits and the - other appears as 0 when read. - - -dirty_background_ratio -====================== - -Contains, as a percentage of total available memory that contains free pages -and reclaimable pages, the number of pages at which the background kernel -flusher threads will start writing out dirty data. - -The total available memory is not equal to total system memory. - - -dirty_bytes -=========== - -Contains the amount of dirty memory at which a process generating disk writes -will itself start writeback. - -Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be -specified at a time. When one sysctl is written it is immediately taken into -account to evaluate the dirty memory limits and the other appears as 0 when -read. - -Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any -value lower than this limit will be ignored and the old configuration will be -retained. - - -dirty_expire_centisecs -====================== - -This tunable is used to define when dirty data is old enough to be eligible -for writeout by the kernel flusher threads. It is expressed in 100'ths -of a second. Data which has been dirty in-memory for longer than this -interval will be written out next time a flusher thread wakes up. - - -dirty_ratio -=========== - -Contains, as a percentage of total available memory that contains free pages -and reclaimable pages, the number of pages at which a process which is -generating disk writes will itself start writing out dirty data. - -The total available memory is not equal to total system memory. - - -dirtytime_expire_seconds -======================== - -When a lazytime inode is constantly having its pages dirtied, the inode with -an updated timestamp will never get chance to be written out. And, if the -only thing that has happened on the file system is a dirtytime inode caused -by an atime update, a worker will be scheduled to make sure that inode -eventually gets pushed out to disk. This tunable is used to define when dirty -inode is old enough to be eligible for writeback by the kernel flusher threads. -And, it is also used as the interval to wakeup dirtytime_writeback thread. - - -dirty_writeback_centisecs -========================= - -The kernel flusher threads will periodically wake up and write `old` data -out to disk. This tunable expresses the interval between those wakeups, in -100'ths of a second. - -Setting this to zero disables periodic writeback altogether. - - -drop_caches -=========== - -Writing to this will cause the kernel to drop clean caches, as well as -reclaimable slab objects like dentries and inodes. Once dropped, their -memory becomes free. - -To free pagecache:: - - echo 1 > /proc/sys/vm/drop_caches - -To free reclaimable slab objects (includes dentries and inodes):: - - echo 2 > /proc/sys/vm/drop_caches - -To free slab objects and pagecache:: - - echo 3 > /proc/sys/vm/drop_caches - -This is a non-destructive operation and will not free any dirty objects. -To increase the number of objects freed by this operation, the user may run -`sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the -number of dirty objects on the system and create more candidates to be -dropped. - -This file is not a means to control the growth of the various kernel caches -(inodes, dentries, pagecache, etc...) These objects are automatically -reclaimed by the kernel when memory is needed elsewhere on the system. - -Use of this file can cause performance problems. Since it discards cached -objects, it may cost a significant amount of I/O and CPU to recreate the -dropped objects, especially if they were under heavy use. Because of this, -use outside of a testing or debugging environment is not recommended. - -You may see informational messages in your kernel log when this file is -used:: - - cat (1234): drop_caches: 3 - -These are informational only. They do not mean that anything is wrong -with your system. To disable them, echo 4 (bit 2) into drop_caches. - - -extfrag_threshold -================= - -This parameter affects whether the kernel will compact memory or direct -reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in -debugfs shows what the fragmentation index for each order is in each zone in -the system. Values tending towards 0 imply allocations would fail due to lack -of memory, values towards 1000 imply failures are due to fragmentation and -1 -implies that the allocation will succeed as long as watermarks are met. - -The kernel will not compact memory in a zone if the -fragmentation index is <= extfrag_threshold. The default value is 500. - - -highmem_is_dirtyable -==================== - -Available only for systems with CONFIG_HIGHMEM enabled (32b systems). - -This parameter controls whether the high memory is considered for dirty -writers throttling. This is not the case by default which means that -only the amount of memory directly visible/usable by the kernel can -be dirtied. As a result, on systems with a large amount of memory and -lowmem basically depleted writers might be throttled too early and -streaming writes can get very slow. - -Changing the value to non zero would allow more memory to be dirtied -and thus allow writers to write more data which can be flushed to the -storage more effectively. Note this also comes with a risk of pre-mature -OOM killer because some writers (e.g. direct block device writes) can -only use the low memory and they can fill it up with dirty data without -any throttling. - - -hugetlb_shm_group -================= - -hugetlb_shm_group contains group id that is allowed to create SysV -shared memory segment using hugetlb page. - - -laptop_mode -=========== - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/laptops/laptop-mode.rst. - - -legacy_va_layout -================ - -If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel -will use the legacy (2.4) layout for all processes. - - -lowmem_reserve_ratio -==================== - -For some specialised workloads on highmem machines it is dangerous for -the kernel to allow process memory to be allocated from the "lowmem" -zone. This is because that memory could then be pinned via the mlock() -system call, or by unavailability of swapspace. - -And on large highmem machines this lack of reclaimable lowmem memory -can be fatal. - -So the Linux page allocator has a mechanism which prevents allocations -which *could* use highmem from using too much lowmem. This means that -a certain amount of lowmem is defended from the possibility of being -captured into pinned user memory. - -(The same argument applies to the old 16 megabyte ISA DMA region. This -mechanism will also defend that region from allocations which could use -highmem or lowmem). - -The `lowmem_reserve_ratio` tunable determines how aggressive the kernel is -in defending these lower zones. - -If you have a machine which uses highmem or ISA DMA and your -applications are using mlock(), or if you are running with no swap then -you probably should change the lowmem_reserve_ratio setting. - -The lowmem_reserve_ratio is an array. You can see them by reading this file:: - - % cat /proc/sys/vm/lowmem_reserve_ratio - 256 256 32 - -But, these values are not used directly. The kernel calculates # of protection -pages for each zones from them. These are shown as array of protection pages -in /proc/zoneinfo like followings. (This is an example of x86-64 box). -Each zone has an array of protection pages like this:: - - Node 0, zone DMA - pages free 1355 - min 3 - low 3 - high 4 - : - : - numa_other 0 - protection: (0, 2004, 2004, 2004) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - pagesets - cpu: 0 pcp: 0 - : - -These protections are added to score to judge whether this zone should be used -for page allocation or should be reclaimed. - -In this example, if normal pages (index=2) are required to this DMA zone and -watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should -not be used because pages_free(1355) is smaller than watermark + protection[2] -(4 + 2004 = 2008). If this protection value is 0, this zone would be used for -normal page requirement. If requirement is DMA zone(index=0), protection[0] -(=0) is used. - -zone[i]'s protection[j] is calculated by following expression:: - - (i < j): - zone[i]->protection[j] - = (total sums of managed_pages from zone[i+1] to zone[j] on the node) - / lowmem_reserve_ratio[i]; - (i = j): - (should not be protected. = 0; - (i > j): - (not necessary, but looks 0) - -The default values of lowmem_reserve_ratio[i] are - - === ==================================== - 256 (if zone[i] means DMA or DMA32 zone) - 32 (others) - === ==================================== - -As above expression, they are reciprocal number of ratio. -256 means 1/256. # of protection pages becomes about "0.39%" of total managed -pages of higher zones on the node. - -If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). The value less than 1 completely -disables protection of the pages. - - -max_map_count: -============== - -This file contains the maximum number of memory map areas a process -may have. Memory map areas are used as a side-effect of calling -malloc, directly by mmap, mprotect, and madvise, and also when loading -shared libraries. - -While most applications need less than a thousand maps, certain -programs, particularly malloc debuggers, may consume lots of them, -e.g., up to one or two maps per allocation. - -The default value is 65536. - - -memory_failure_early_kill: -========================== - -Control how to kill processes when uncorrected memory error (typically -a 2bit error in a memory module) is detected in the background by hardware -that cannot be handled by the kernel. In some cases (like the page -still having a valid copy on disk) the kernel will handle the failure -transparently without affecting any applications. But if there is -no other uptodate copy of the data it will kill to prevent any data -corruptions from propagating. - -1: Kill all processes that have the corrupted and not reloadable page mapped -as soon as the corruption is detected. Note this is not supported -for a few types of pages, like kernel internally allocated data or -the swap cache, but works for the majority of user pages. - -0: Only unmap the corrupted page from all processes and only kill a process -who tries to access it. - -The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can -handle this if they want to. - -This is only active on architectures/platforms with advanced machine -check handling and depends on the hardware capabilities. - -Applications can override this setting individually with the PR_MCE_KILL prctl - - -memory_failure_recovery -======================= - -Enable memory failure recovery (when supported by the platform) - -1: Attempt recovery. - -0: Always panic on a memory failure. - - -min_free_kbytes -=============== - -This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a -watermark[WMARK_MIN] value for each lowmem zone in the system. -Each lowmem zone gets a number of reserved free pages based -proportionally on its size. - -Some minimal amount of memory is needed to satisfy PF_MEMALLOC -allocations; if you set this to lower than 1024KB, your system will -become subtly broken, and prone to deadlock under high loads. - -Setting this too high will OOM your machine instantly. - - -min_slab_ratio -============== - -This is available only on NUMA kernels. - -A percentage of the total pages in each zone. On Zone reclaim -(fallback from the local zone occurs) slabs will be reclaimed if more -than this percentage of pages in a zone are reclaimable slab pages. -This insures that the slab growth stays under control even in NUMA -systems that rarely perform global reclaim. - -The default is 5 percent. - -Note that slab reclaim is triggered in a per zone / node fashion. -The process of reclaiming slab memory is currently not node specific -and may not be fast. - - -min_unmapped_ratio -================== - -This is available only on NUMA kernels. - -This is a percentage of the total pages in each zone. Zone reclaim will -only occur if more than this percentage of pages are in a state that -zone_reclaim_mode allows to be reclaimed. - -If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared -against all file-backed unmapped pages including swapcache pages and tmpfs -files. Otherwise, only unmapped pages backed by normal files but not tmpfs -files and similar are considered. - -The default is 1 percent. - - -mmap_min_addr -============= - -This file indicates the amount of address space which a user process will -be restricted from mmapping. Since kernel null dereference bugs could -accidentally operate based on the information in the first couple of pages -of memory userspace processes should not be allowed to write to them. By -default this value is set to 0 and no protections will be enforced by the -security module. Setting this value to something like 64k will allow the -vast majority of applications to work correctly and provide defense in depth -against future potential kernel bugs. - - -mmap_rnd_bits -============= - -This value can be used to select the number of bits to use to -determine the random offset to the base address of vma regions -resulting from mmap allocations on architectures which support -tuning address space randomization. This value will be bounded -by the architecture's minimum and maximum supported values. - -This value can be changed after boot using the -/proc/sys/vm/mmap_rnd_bits tunable - - -mmap_rnd_compat_bits -==================== - -This value can be used to select the number of bits to use to -determine the random offset to the base address of vma regions -resulting from mmap allocations for applications run in -compatibility mode on architectures which support tuning address -space randomization. This value will be bounded by the -architecture's minimum and maximum supported values. - -This value can be changed after boot using the -/proc/sys/vm/mmap_rnd_compat_bits tunable - - -nr_hugepages -============ - -Change the minimum size of the hugepage pool. - -See Documentation/admin-guide/mm/hugetlbpage.rst - - -nr_hugepages_mempolicy -====================== - -Change the size of the hugepage pool at run-time on a specific -set of NUMA nodes. - -See Documentation/admin-guide/mm/hugetlbpage.rst - - -nr_overcommit_hugepages -======================= - -Change the maximum size of the hugepage pool. The maximum is -nr_hugepages + nr_overcommit_hugepages. - -See Documentation/admin-guide/mm/hugetlbpage.rst - - -nr_trim_pages -============= - -This is available only on NOMMU kernels. - -This value adjusts the excess page trimming behaviour of power-of-2 aligned -NOMMU mmap allocations. - -A value of 0 disables trimming of allocations entirely, while a value of 1 -trims excess pages aggressively. Any value >= 1 acts as the watermark where -trimming of allocations is initiated. - -The default value is 1. - -See Documentation/nommu-mmap.txt for more information. - - -numa_zonelist_order -=================== - -This sysctl is only for NUMA and it is deprecated. Anything but -Node order will fail! - -'where the memory is allocated from' is controlled by zonelists. - -(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. -you may be able to read ZONE_DMA as ZONE_DMA32...) - -In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following. -ZONE_NORMAL -> ZONE_DMA -This means that a memory allocation request for GFP_KERNEL will -get memory from ZONE_DMA only when ZONE_NORMAL is not available. - -In NUMA case, you can think of following 2 types of order. -Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL:: - - (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL - (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA. - -Type(A) offers the best locality for processes on Node(0), but ZONE_DMA -will be used before ZONE_NORMAL exhaustion. This increases possibility of -out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small. - -Type(B) cannot offer the best locality but is more robust against OOM of -the DMA zone. - -Type(A) is called as "Node" order. Type (B) is "Zone" order. - -"Node order" orders the zonelists by node, then by zone within each node. -Specify "[Nn]ode" for node order - -"Zone Order" orders the zonelists by zone type, then by node within each -zone. Specify "[Zz]one" for zone order. - -Specify "[Dd]efault" to request automatic configuration. - -On 32-bit, the Normal zone needs to be preserved for allocations accessible -by the kernel, so "zone" order will be selected. - -On 64-bit, devices that require DMA32/DMA are relatively rare, so "node" -order will be selected. - -Default order is recommended unless this is causing problems for your -system/application. - - -oom_dump_tasks -============== - -Enables a system-wide task dump (excluding kernel threads) to be produced -when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj -score, and name. This is helpful to determine why the OOM killer was -invoked, to identify the rogue task that caused it, and to determine why -the OOM killer chose the task it did to kill. - -If this is set to zero, this information is suppressed. On very -large systems with thousands of tasks it may not be feasible to dump -the memory state information for each one. Such systems should not -be forced to incur a performance penalty in OOM conditions when the -information may not be desired. - -If this is set to non-zero, this information is shown whenever the -OOM killer actually kills a memory-hogging task. - -The default value is 1 (enabled). - - -oom_kill_allocating_task -======================== - -This enables or disables killing the OOM-triggering task in -out-of-memory situations. - -If this is set to zero, the OOM killer will scan through the entire -tasklist and select a task based on heuristics to kill. This normally -selects a rogue memory-hogging task that frees up a large amount of -memory when killed. - -If this is set to non-zero, the OOM killer simply kills the task that -triggered the out-of-memory condition. This avoids the expensive -tasklist scan. - -If panic_on_oom is selected, it takes precedence over whatever value -is used in oom_kill_allocating_task. - -The default value is 0. - - -overcommit_kbytes -================= - -When overcommit_memory is set to 2, the committed address space is not -permitted to exceed swap plus this amount of physical RAM. See below. - -Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one -of them may be specified at a time. Setting one disables the other (which -then appears as 0 when read). - - -overcommit_memory -================= - -This value contains a flag that enables memory overcommitment. - -When this flag is 0, the kernel attempts to estimate the amount -of free memory left when userspace requests more memory. - -When this flag is 1, the kernel pretends there is always enough -memory until it actually runs out. - -When this flag is 2, the kernel uses a "never overcommit" -policy that attempts to prevent any overcommit of memory. -Note that user_reserve_kbytes affects this policy. - -This feature can be very useful because there are a lot of -programs that malloc() huge amounts of memory "just-in-case" -and don't use much of it. - -The default value is 0. - -See Documentation/vm/overcommit-accounting.rst and -mm/util.c::__vm_enough_memory() for more information. - - -overcommit_ratio -================ - -When overcommit_memory is set to 2, the committed address -space is not permitted to exceed swap plus this percentage -of physical RAM. See above. - - -page-cluster -============ - -page-cluster controls the number of pages up to which consecutive pages -are read in from swap in a single attempt. This is the swap counterpart -to page cache readahead. -The mentioned consecutivity is not in terms of virtual/physical addresses, -but consecutive on swap space - that means they were swapped out together. - -It is a logarithmic value - setting it to zero means "1 page", setting -it to 1 means "2 pages", setting it to 2 means "4 pages", etc. -Zero disables swap readahead completely. - -The default value is three (eight pages at a time). There may be some -small benefits in tuning this to a different value if your workload is -swap-intensive. - -Lower values mean lower latencies for initial faults, but at the same time -extra faults and I/O delays for following faults if they would have been part of -that consecutive pages readahead would have brought in. - - -panic_on_oom -============ - -This enables or disables panic on out-of-memory feature. - -If this is set to 0, the kernel will kill some rogue process, -called oom_killer. Usually, oom_killer can kill rogue processes and -system will survive. - -If this is set to 1, the kernel panics when out-of-memory happens. -However, if a process limits using nodes by mempolicy/cpusets, -and those nodes become memory exhaustion status, one process -may be killed by oom-killer. No panic occurs in this case. -Because other nodes' memory may be free. This means system total status -may be not fatal yet. - -If this is set to 2, the kernel panics compulsorily even on the -above-mentioned. Even oom happens under memory cgroup, the whole -system panics. - -The default value is 0. - -1 and 2 are for failover of clustering. Please select either -according to your policy of failover. - -panic_on_oom=2+kdump gives you very strong tool to investigate -why oom happens. You can get snapshot. - - -percpu_pagelist_fraction -======================== - -This is the fraction of pages at most (high mark pcp->high) in each zone that -are allocated for each per cpu page list. The min value for this is 8. It -means that we don't allow more than 1/8th of pages in each zone to be -allocated in any single per_cpu_pagelist. This entry only changes the value -of hot per cpu pagelists. User can specify a number like 100 to allocate -1/100th of each zone to each per cpu page list. - -The batch value of each per cpu pagelist is also updated as a result. It is -set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) - -The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. If the user writes '0' to this -sysctl, it will revert to this default behavior. - - -stat_interval -============= - -The time interval between which vm statistics are updated. The default -is 1 second. - - -stat_refresh -============ - -Any read or write (by root only) flushes all the per-cpu vm statistics -into their global totals, for more accurate reports when testing -e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo - -As a side-effect, it also checks for negative totals (elsewhere reported -as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. -(At time of writing, a few stats are known sometimes to be found negative, -with no ill effects: errors and warnings on these stats are suppressed.) - - -numa_stat -========= - -This interface allows runtime configuration of numa statistics. - -When page allocation performance becomes a bottleneck and you can tolerate -some possible tool breakage and decreased numa counter precision, you can -do:: - - echo 0 > /proc/sys/vm/numa_stat - -When page allocation performance is not a bottleneck and you want all -tooling to work, you can do:: - - echo 1 > /proc/sys/vm/numa_stat - - -swappiness -========== - -This control is used to define how aggressive the kernel will swap -memory pages. Higher values will increase aggressiveness, lower values -decrease the amount of swap. A value of 0 instructs the kernel not to -initiate swap until the amount of free and file-backed pages is less -than the high water mark in a zone. - -The default value is 60. - - -unprivileged_userfaultfd -======================== - -This flag controls whether unprivileged users can use the userfaultfd -system calls. Set this to 1 to allow unprivileged users to use the -userfaultfd system calls, or set this to 0 to restrict userfaultfd to only -privileged users (with SYS_CAP_PTRACE capability). - -The default value is 1. - - -user_reserve_kbytes -=================== - -When overcommit_memory is set to 2, "never overcommit" mode, reserve -min(3% of current process size, user_reserve_kbytes) of free memory. -This is intended to prevent a user from starting a single memory hogging -process, such that they cannot recover (kill the hog). - -user_reserve_kbytes defaults to min(3% of the current process size, 128MB). - -If this is reduced to zero, then the user will be allowed to allocate -all free memory with a single process, minus admin_reserve_kbytes. -Any subsequent attempts to execute a command will result in -"fork: Cannot allocate memory". - -Changing this takes effect whenever an application requests memory. - - -vfs_cache_pressure -================== - -This percentage value controls the tendency of the kernel to reclaim -the memory which is used for caching of directory and inode objects. - -At the default value of vfs_cache_pressure=100 the kernel will attempt to -reclaim dentries and inodes at a "fair" rate with respect to pagecache and -swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer -to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will -never reclaim dentries and inodes due to memory pressure and this can easily -lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100 -causes the kernel to prefer to reclaim dentries and inodes. - -Increasing vfs_cache_pressure significantly beyond 100 may have negative -performance impact. Reclaim code needs to take various locks to find freeable -directory and inode objects. With vfs_cache_pressure=1000, it will look for -ten times more freeable objects than there are. - - -watermark_boost_factor -====================== - -This factor controls the level of reclaim when memory is being fragmented. -It defines the percentage of the high watermark of a zone that will be -reclaimed if pages of different mobility are being mixed within pageblocks. -The intent is that compaction has less work to do in the future and to -increase the success rate of future high-order allocations such as SLUB -allocations, THP and hugetlbfs pages. - -To make it sensible with respect to the watermark_scale_factor -parameter, the unit is in fractions of 10,000. The default value of -15,000 on !DISCONTIGMEM configurations means that up to 150% of the high -watermark will be reclaimed in the event of a pageblock being mixed due -to fragmentation. The level of reclaim is determined by the number of -fragmentation events that occurred in the recent past. If this value is -smaller than a pageblock then a pageblocks worth of pages will be reclaimed -(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. - - -watermark_scale_factor -====================== - -This factor controls the aggressiveness of kswapd. It defines the -amount of memory left in a node/system before kswapd is woken up and -how much memory needs to be free before kswapd goes back to sleep. - -The unit is in fractions of 10,000. The default value of 10 means the -distances between watermarks are 0.1% of the available memory in the -node/system. The maximum value is 1000, or 10% of memory. - -A high rate of threads entering direct reclaim (allocstall) or kswapd -going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate -that the number of free pages kswapd maintains for latency reasons is -too small for the allocation bursts occurring in the system. This knob -can then be used to tune kswapd aggressiveness accordingly. - - -zone_reclaim_mode -================= - -Zone_reclaim_mode allows someone to set more or less aggressive approaches to -reclaim memory when a zone runs out of memory. If it is set to zero then no -zone reclaim occurs. Allocations will be satisfied from other zones / nodes -in the system. - -This is value OR'ed together of - -= =================================== -1 Zone reclaim on -2 Zone reclaim writes dirty pages out -4 Zone reclaim swaps pages -= =================================== - -zone_reclaim_mode is disabled by default. For file servers or workloads -that benefit from having their data cached, zone_reclaim_mode should be -left disabled as the caching effect is likely to be more important than -data locality. - -zone_reclaim may be enabled if it's known that the workload is partitioned -such that each partition fits within a NUMA node and that accessing remote -memory would cause a measurable performance reduction. The page allocator -will then reclaim easily reusable pages (those page cache pages that are -currently not used) before allocating off node pages. - -Allowing zone reclaim to write out pages stops processes that are -writing large amounts of data from dirtying pages on other nodes. Zone -reclaim will write out dirty pages if a zone fills up and so effectively -throttle the process. This may decrease the performance of a single process -since it cannot use all of system memory to buffer the outgoing writes -anymore but it preserve the memory on other nodes so that the performance -of other processes running on other nodes will not be affected. - -Allowing regular swap effectively restricts allocations to the local -node unless explicitly overridden by memory policies or cpuset -configurations. diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index 8ba656f37cd8..109052215bce 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -439,7 +439,7 @@ Compacting MLOCKED Pages The unevictable LRU can be scanned for compactable regions and the default behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls -this behavior (see Documentation/sysctl/vm.rst). Once scanning of the +this behavior (see Documentation/admin-guide/sysctl/vm.rst). Once scanning of the unevictable LRU is enabled, the work of compaction is mostly handled by the page migration code and the same work flow as described in MIGRATING MLOCKED PAGES will apply. -- cgit From 9e1cbede267916e737c4a755059418da3ac4de95 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 15:07:43 -0300 Subject: docs: admin-guide: add laptops documentation The docs under Documentation/laptops contain users specific information. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andy Shevchenko --- Documentation/ABI/testing/sysfs-block-device | 2 +- .../ABI/testing/sysfs-platform-asus-laptop | 2 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/laptops/asus-laptop.rst | 271 ++++ .../admin-guide/laptops/disk-shock-protection.rst | 151 ++ Documentation/admin-guide/laptops/index.rst | 16 + Documentation/admin-guide/laptops/laptop-mode.rst | 781 ++++++++++ Documentation/admin-guide/laptops/lg-laptop.rst | 84 ++ Documentation/admin-guide/laptops/sony-laptop.rst | 174 +++ Documentation/admin-guide/laptops/sonypi.rst | 160 ++ .../admin-guide/laptops/thinkpad-acpi.rst | 1562 ++++++++++++++++++++ Documentation/admin-guide/laptops/toshiba_haps.rst | 87 ++ Documentation/admin-guide/sysctl/vm.rst | 4 +- Documentation/laptops/asus-laptop.rst | 271 ---- Documentation/laptops/disk-shock-protection.rst | 151 -- Documentation/laptops/index.rst | 17 - Documentation/laptops/laptop-mode.rst | 781 ---------- Documentation/laptops/lg-laptop.rst | 85 -- Documentation/laptops/sony-laptop.rst | 174 --- Documentation/laptops/sonypi.rst | 160 -- Documentation/laptops/thinkpad-acpi.rst | 1562 -------------------- Documentation/laptops/toshiba_haps.rst | 87 -- 23 files changed, 3292 insertions(+), 3293 deletions(-) create mode 100644 Documentation/admin-guide/laptops/asus-laptop.rst create mode 100644 Documentation/admin-guide/laptops/disk-shock-protection.rst create mode 100644 Documentation/admin-guide/laptops/index.rst create mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst create mode 100644 Documentation/admin-guide/laptops/lg-laptop.rst create mode 100644 Documentation/admin-guide/laptops/sony-laptop.rst create mode 100644 Documentation/admin-guide/laptops/sonypi.rst create mode 100644 Documentation/admin-guide/laptops/thinkpad-acpi.rst create mode 100644 Documentation/admin-guide/laptops/toshiba_haps.rst delete mode 100644 Documentation/laptops/asus-laptop.rst delete mode 100644 Documentation/laptops/disk-shock-protection.rst delete mode 100644 Documentation/laptops/index.rst delete mode 100644 Documentation/laptops/laptop-mode.rst delete mode 100644 Documentation/laptops/lg-laptop.rst delete mode 100644 Documentation/laptops/sony-laptop.rst delete mode 100644 Documentation/laptops/sonypi.rst delete mode 100644 Documentation/laptops/thinkpad-acpi.rst delete mode 100644 Documentation/laptops/toshiba_haps.rst (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device index 0d57bbb4fddc..17f2bc7dd261 100644 --- a/Documentation/ABI/testing/sysfs-block-device +++ b/Documentation/ABI/testing/sysfs-block-device @@ -45,7 +45,7 @@ Description: - Values below -2 are rejected with -EINVAL For more information, see - Documentation/laptops/disk-shock-protection.rst + Documentation/admin-guide/laptops/disk-shock-protection.rst What: /sys/block/*/device/ncq_prio_enable diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop index d67fa4bafa70..8b0e8205a6a2 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-laptop +++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop @@ -31,7 +31,7 @@ Description: To control the LED display, use the following : echo 0x0T000DDD > /sys/devices/platform/asus_laptop/ where T control the 3 letters display, and DDD the 3 digits display. - The DDD table can be found in Documentation/laptops/asus-laptop.rst + The DDD table can be found in Documentation/admin-guide/laptops/asus-laptop.rst What: /sys/devices/platform/asus_laptop/bluetooth Date: January 2007 diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 5c6ae1ccee1a..6fcc83aaa9b6 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -82,6 +82,7 @@ configure specific aspects of kernel behavior to your liking. perf-security acpi/index device-mapper/index + laptops/index .. only:: subproject and html diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b323f5d4366a..4821175a3769 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4347,7 +4347,7 @@ Format: sonypi.*= [HW] Sony Programmable I/O Control Device driver - See Documentation/laptops/sonypi.rst + See Documentation/admin-guide/laptops/sonypi.rst spectre_v2= [X86] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/Documentation/admin-guide/laptops/asus-laptop.rst b/Documentation/admin-guide/laptops/asus-laptop.rst new file mode 100644 index 000000000000..95176321a25a --- /dev/null +++ b/Documentation/admin-guide/laptops/asus-laptop.rst @@ -0,0 +1,271 @@ +================== +Asus Laptop Extras +================== + +Version 0.1 + +August 6, 2009 + +Corentin Chary +http://acpi4asus.sf.net/ + + This driver provides support for extra features of ACPI-compatible ASUS laptops. + It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or + VICTOR XP7210 for example). It makes all the extra buttons generate input + events (like keyboards). + + On some models adds support for changing the display brightness and output, + switching the LCD backlight on and off, and most importantly, allows you to + blink those fancy LEDs intended for reporting mail and wireless status. + +This driver supersedes the old asus_acpi driver. + +Requirements +------------ + + Kernel 2.6.X sources, configured for your computer, with ACPI support. + You also need CONFIG_INPUT and CONFIG_ACPI. + +Status +------ + + The features currently supported are the following (see below for + detailed description): + + - Fn key combinations + - Bluetooth enable and disable + - Wlan enable and disable + - GPS enable and disable + - Video output switching + - Ambient Light Sensor on and off + - LED control + - LED Display control + - LCD brightness control + - LCD on and off + + A compatibility table by model and feature is maintained on the web + site, http://acpi4asus.sf.net/. + +Usage +----- + + Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should + see some lines like this : + + Asus Laptop Extras version 0.42 + - L2D model detected. + + If it is not the output you have on your laptop, send it (and the laptop's + DSDT) to me. + + That's all, now, all the events generated by the hotkeys of your laptop + should be reported via netlink events. You can check with + "acpi_genl monitor" (part of the acpica project). + + Hotkeys are also reported as input keys (like keyboards) you can check + which key are supported using "xev" under X11. + + You can get information on the version of your DSDT table by reading the + /sys/devices/platform/asus-laptop/infos entry. If you have a question or a + bug report to do, please include the output of this entry. + +LEDs +---- + + You can modify LEDs be echoing values to `/sys/class/leds/asus/*/brightness`:: + + echo 1 > /sys/class/leds/asus::mail/brightness + + will switch the mail LED on. + + You can also know if they are on/off by reading their content and use + kernel triggers like disk-activity or heartbeat. + +Backlight +--------- + + You can control lcd backlight power and brightness with + /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15. + +Wireless devices +---------------- + + You can turn the internal Bluetooth adapter on/off with the bluetooth entry + (only on models with Bluetooth). This usually controls the associated LED. + Same for Wlan adapter. + +Display switching +----------------- + + Note: the display switching code is currently considered EXPERIMENTAL. + + Switching works for the following models: + + - L3800C + - A2500H + - L5800C + - M5200N + - W1000N (albeit with some glitches) + - M6700R + - A6JC + - F3J + + Switching doesn't work for the following: + + - M3700N + - L2X00D (locks the laptop under certain conditions) + + To switch the displays, echo values from 0 to 15 to + /sys/devices/platform/asus-laptop/display. The significance of those values + is as follows: + + +-------+-----+-----+-----+-----+-----+ + | Bin | Val | DVI | TV | CRT | LCD | + +-------+-----+-----+-----+-----+-----+ + | 0000 | 0 | | | | | + +-------+-----+-----+-----+-----+-----+ + | 0001 | 1 | | | | X | + +-------+-----+-----+-----+-----+-----+ + | 0010 | 2 | | | X | | + +-------+-----+-----+-----+-----+-----+ + | 0011 | 3 | | | X | X | + +-------+-----+-----+-----+-----+-----+ + | 0100 | 4 | | X | | | + +-------+-----+-----+-----+-----+-----+ + | 0101 | 5 | | X | | X | + +-------+-----+-----+-----+-----+-----+ + | 0110 | 6 | | X | X | | + +-------+-----+-----+-----+-----+-----+ + | 0111 | 7 | | X | X | X | + +-------+-----+-----+-----+-----+-----+ + | 1000 | 8 | X | | | | + +-------+-----+-----+-----+-----+-----+ + | 1001 | 9 | X | | | X | + +-------+-----+-----+-----+-----+-----+ + | 1010 | 10 | X | | X | | + +-------+-----+-----+-----+-----+-----+ + | 1011 | 11 | X | | X | X | + +-------+-----+-----+-----+-----+-----+ + | 1100 | 12 | X | X | | | + +-------+-----+-----+-----+-----+-----+ + | 1101 | 13 | X | X | | X | + +-------+-----+-----+-----+-----+-----+ + | 1110 | 14 | X | X | X | | + +-------+-----+-----+-----+-----+-----+ + | 1111 | 15 | X | X | X | X | + +-------+-----+-----+-----+-----+-----+ + + In most cases, the appropriate displays must be plugged in for the above + combinations to work. TV-Out may need to be initialized at boot time. + + Debugging: + + 1) Check whether the Fn+F8 key: + + a) does not lock the laptop (try a boot with noapic / nolapic if it does) + b) generates events (0x6n, where n is the value corresponding to the + configuration above) + c) actually works + + Record the disp value at every configuration. + 2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display. + Record its value, note any change. If nothing changes, try a broader range, + up to 65535. + 3) Send ANY output (both positive and negative reports are needed, unless your + machine is already listed above) to the acpi4asus-user mailing list. + + Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n + events are generated and no actual switching occurs. In such a case, a line + like:: + + echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display + + will usually do the trick ($arg is the 0000006n-like event passed to acpid). + + Note: there is currently no reliable way to read display status on xxN + (Centrino) models. + +LED display +----------- + + Some models like the W1N have a LED display that can be used to display + several items of information. + + LED display works for the following models: + + - W1000N + - W1J + + To control the LED display, use the following:: + + echo 0x0T000DDD > /sys/devices/platform/asus-laptop/ + + where T control the 3 letters display, and DDD the 3 digits display, + according to the tables below:: + + DDD (digits) + 000 to 999 = display digits + AAA = --- + BBB to FFF = turn-off + + T (type) + 0 = off + 1 = dvd + 2 = vcd + 3 = mp3 + 4 = cd + 5 = tv + 6 = cpu + 7 = vol + + For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd" + would display "DVD001". + +Driver options +-------------- + + Options can be passed to the asus-laptop driver using the standard + module argument syntax (= when passing the option to the + module or asus-laptop.= on the kernel boot line when + asus-laptop is statically linked into the kernel). + + wapf: WAPF defines the behavior of the Fn+Fx wlan key + The significance of values is yet to be found, but + most of the time: + + - 0x0 should do nothing + - 0x1 should allow to control the device with Fn+Fx key. + - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key + - 0x5 like 0x1 or 0x4 + + The default value is 0x1. + +Unsupported models +------------------ + + These models will never be supported by this module, as they use a completely + different mechanism to handle LEDs and extra stuff (meaning we have no clue + how it works): + + - ASUS A1300 (A1B), A1370D + - ASUS L7300G + - ASUS L8400 + +Patches, Errors, Questions +-------------------------- + + I appreciate any success or failure + reports, especially if they add to or correct the compatibility table. + Please include the following information in your report: + + - Asus model name + - a copy of your ACPI tables, using the "acpidump" utility + - a copy of /sys/devices/platform/asus-laptop/infos + - which driver features work and which don't + - the observed behavior of non-working features + + Any other comments or patches are also more than welcome. + + acpi4asus-user@lists.sourceforge.net + + http://sourceforge.net/projects/acpi4asus diff --git a/Documentation/admin-guide/laptops/disk-shock-protection.rst b/Documentation/admin-guide/laptops/disk-shock-protection.rst new file mode 100644 index 000000000000..e97c5f78d8c3 --- /dev/null +++ b/Documentation/admin-guide/laptops/disk-shock-protection.rst @@ -0,0 +1,151 @@ +========================== +Hard disk shock protection +========================== + +Author: Elias Oltmanns + +Last modified: 2008-10-03 + + +.. 0. Contents + + 1. Intro + 2. The interface + 3. References + 4. CREDITS + + +1. Intro +-------- + +ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature. +Issuing this command should cause the drive to switch to idle mode and +unload disk heads. This feature is being used in modern laptops in +conjunction with accelerometers and appropriate software to implement +a shock protection facility. The idea is to stop all I/O operations on +the internal hard drive and park its heads on the ramp when critical +situations are anticipated. The desire to have such a feature +available on GNU/Linux systems has been the original motivation to +implement a generic disk head parking interface in the Linux kernel. +Please note, however, that other components have to be set up on your +system in order to get disk shock protection working (see +section 3. References below for pointers to more information about +that). + + +2. The interface +---------------- + +For each ATA device, the kernel exports the file +`block/*/device/unload_heads` in sysfs (here assumed to be mounted under +/sys). Access to `/sys/block/*/device/unload_heads` is denied with +-EOPNOTSUPP if the device does not support the unload feature. +Otherwise, writing an integer value to this file will take the heads +of the respective drive off the platter and block all I/O operations +for the specified number of milliseconds. When the timeout expires and +no further disk head park request has been issued in the meantime, +normal operation will be resumed. The maximal value accepted for a +timeout is 30000 milliseconds. Exceeding this limit will return +-EOVERFLOW, but heads will be parked anyway and the timeout will be +set to 30 seconds. However, you can always change a timeout to any +value between 0 and 30000 by issuing a subsequent head park request +before the timeout of the previous one has expired. In particular, the +total timeout can exceed 30 seconds and, more importantly, you can +cancel a previously set timeout and resume normal operation +immediately by specifying a timeout of 0. Values below -2 are rejected +with -EINVAL (see below for the special meaning of -1 and -2). If the +timeout specified for a recent head park request has not yet expired, +reading from `/sys/block/*/device/unload_heads` will report the number +of milliseconds remaining until normal operation will be resumed; +otherwise, reading the unload_heads attribute will return 0. + +For example, do the following in order to park the heads of drive +/dev/sda and stop all I/O operations for five seconds:: + + # echo 5000 > /sys/block/sda/device/unload_heads + +A simple:: + + # cat /sys/block/sda/device/unload_heads + +will show you how many milliseconds are left before normal operation +will be resumed. + +A word of caution: The fact that the interface operates on a basis of +milliseconds may raise expectations that cannot be satisfied in +reality. In fact, the ATA specs clearly state that the time for an +unload operation to complete is vendor specific. The hint in ATA-7 +that this will typically be within 500 milliseconds apparently has +been dropped in ATA-8. + +There is a technical detail of this implementation that may cause some +confusion and should be discussed here. When a head park request has +been issued to a device successfully, all I/O operations on the +controller port this device is attached to will be deferred. That is +to say, any other device that may be connected to the same port will +be affected too. The only exception is that a subsequent head unload +request to that other device will be executed immediately. Further +operations on that port will be deferred until the timeout specified +for either device on the port has expired. As far as PATA (old style +IDE) configurations are concerned, there can only be two devices +attached to any single port. In SATA world we have port multipliers +which means that a user-issued head parking request to one device may +actually result in stopping I/O to a whole bunch of devices. However, +since this feature is supposed to be used on laptops and does not seem +to be very useful in any other environment, there will be mostly one +device per port. Even if the CD/DVD writer happens to be connected to +the same port as the hard drive, it generally *should* recover just +fine from the occasional buffer under-run incurred by a head park +request to the HD. Actually, when you are using an ide driver rather +than its libata counterpart (i.e. your disk is called /dev/hda +instead of /dev/sda), then parking the heads of one drive (drive X) +will generally not affect the mode of operation of another drive +(drive Y) on the same port as described above. It is only when a port +reset is required to recover from an exception on drive Y that further +I/O operations on that drive (and the reset itself) will be delayed +until drive X is no longer in the parked state. + +Finally, there are some hard drives that only comply with an earlier +version of the ATA standard than ATA-7, but do support the unload +feature nonetheless. Unfortunately, there is no safe way Linux can +detect these devices, so you won't be able to write to the +unload_heads attribute. If you know that your device really does +support the unload feature (for instance, because the vendor of your +laptop or the hard drive itself told you so), then you can tell the +kernel to enable the usage of this feature for that drive by writing +the special value -1 to the unload_heads attribute:: + + # echo -1 > /sys/block/sda/device/unload_heads + +will enable the feature for /dev/sda, and giving -2 instead of -1 will +disable it again. + + +3. References +------------- + +There are several laptops from different vendors featuring shock +protection capabilities. As manufacturers have refused to support open +source development of the required software components so far, Linux +support for shock protection varies considerably between different +hardware implementations. Ideally, this section should contain a list +of pointers at different projects aiming at an implementation of shock +protection on different systems. Unfortunately, I only know of a +single project which, although still considered experimental, is fit +for use. Please feel free to add projects that have been the victims +of my ignorance. + +- http://www.thinkwiki.org/wiki/HDAPS + + See this page for information about Linux support of the hard disk + active protection system as implemented in IBM/Lenovo Thinkpads. + + +4. CREDITS +---------- + +This implementation of disk head parking has been inspired by a patch +originally published by Jon Escombe . My efforts +to develop an implementation of this feature that is fit to be merged +into mainline have been aided by various kernel developers, in +particular by Tejun Heo and Bartlomiej Zolnierkiewicz. diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst new file mode 100644 index 000000000000..6b554e39863b --- /dev/null +++ b/Documentation/admin-guide/laptops/index.rst @@ -0,0 +1,16 @@ + +============== +Laptop Drivers +============== + +.. toctree:: + :maxdepth: 1 + + asus-laptop + disk-shock-protection + laptop-mode + lg-laptop + sony-laptop + sonypi + thinkpad-acpi + toshiba_haps diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst new file mode 100644 index 000000000000..c984c4262f2e --- /dev/null +++ b/Documentation/admin-guide/laptops/laptop-mode.rst @@ -0,0 +1,781 @@ +=============================================== +How to conserve battery power using laptop-mode +=============================================== + +Document Author: Bart Samwel (bart@samwel.tk) + +Date created: January 2, 2004 + +Last modified: December 06, 2004 + +Introduction +------------ + +Laptop mode is used to minimize the time that the hard disk needs to be spun up, +to conserve battery power on laptops. It has been reported to cause significant +power savings. + +.. Contents + + * Introduction + * Installation + * Caveats + * The Details + * Tips & Tricks + * Control script + * ACPI integration + * Monitoring tool + + +Installation +------------ + +To use laptop mode, you don't need to set any kernel configuration options +or anything. Simply install all the files included in this document, and +laptop mode will automatically be started when you're on battery. For +your convenience, a tarball containing an installer can be downloaded at: + + http://www.samwel.tk/laptop_mode/laptop_mode/ + +To configure laptop mode, you need to edit the configuration file, which is +located in /etc/default/laptop-mode on Debian-based systems, or in +/etc/sysconfig/laptop-mode on other systems. + +Unfortunately, automatic enabling of laptop mode does not work for +laptops that don't have ACPI. On those laptops, you need to start laptop +mode manually. To start laptop mode, run "laptop_mode start", and to +stop it, run "laptop_mode stop". (Note: The laptop mode tools package now +has experimental support for APM, you might want to try that first.) + + +Caveats +------- + +* The downside of laptop mode is that you have a chance of losing up to 10 + minutes of work. If you cannot afford this, don't use it! The supplied ACPI + scripts automatically turn off laptop mode when the battery almost runs out, + so that you won't lose any data at the end of your battery life. + +* Most desktop hard drives have a very limited lifetime measured in spindown + cycles, typically about 50.000 times (it's usually listed on the spec sheet). + Check your drive's rating, and don't wear down your drive's lifetime if you + don't need to. + +* If you mount some of your ext3/reiserfs filesystems with the -n option, then + the control script will not be able to remount them correctly. You must set + DO_REMOUNTS=0 in the control script, otherwise it will remount them with the + wrong options -- or it will fail because it cannot write to /etc/mtab. + +* If you have your filesystems listed as type "auto" in fstab, like I did, then + the control script will not recognize them as filesystems that need remounting. + You must list the filesystems with their true type instead. + +* It has been reported that some versions of the mutt mail client use file access + times to determine whether a folder contains new mail. If you use mutt and + experience this, you must disable the noatime remounting by setting the option + DO_REMOUNT_NOATIME to 0 in the configuration file. + + +The Details +----------- + +Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is +present for all kernels that have the laptop mode patch, regardless of any +configuration options. When the knob is set, any physical disk I/O (that might +have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The +result of this is that after a disk has spun down, it will not be spun up +anymore to write dirty blocks, because those blocks had already been written +immediately after the most recent read operation. The value of the laptop_mode +knob determines the time between the occurrence of disk I/O and when the flush +is triggered. A sensible value for the knob is 5 seconds. Setting the knob to +0 disables laptop mode. + +To increase the effectiveness of the laptop_mode strategy, the laptop_mode +control script increases dirty_expire_centisecs and dirty_writeback_centisecs in +/proc/sys/vm to about 10 minutes (by default), which means that pages that are +dirtied are not forced to be written to disk as often. The control script also +changes the dirty background ratio, so that background writeback of dirty pages +is not done anymore. Combined with a higher commit value (also 10 minutes) for +ext3 or ReiserFS filesystems (also done automatically by the control script), +this results in concentration of disk activity in a small time interval which +occurs only once every 10 minutes, or whenever the disk is forced to spin up by +a cache miss. The disk can then be spun down in the periods of inactivity. + +If you want to find out which process caused the disk to spin up, you can +gather information by setting the flag /proc/sys/vm/block_dump. When this flag +is set, Linux reports all disk read and write operations that take place, and +all block dirtyings done to files. This makes it possible to debug why a disk +needs to spin up, and to increase battery life even more. The output of +block_dump is written to the kernel output, and it can be retrieved using +"dmesg". When you use block_dump and your kernel logging level also includes +kernel debugging messages, you probably want to turn off klogd, otherwise +the output of block_dump will be logged, causing disk activity that is not +normally there. + + +Configuration +------------- + +The laptop mode configuration file is located in /etc/default/laptop-mode on +Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It +contains the following options: + +MAX_AGE: + +Maximum time, in seconds, of hard drive spindown time that you are +comfortable with. Worst case, it's possible that you could lose this +amount of work if your battery fails while you're in laptop mode. + +MINIMUM_BATTERY_MINUTES: + +Automatically disable laptop mode if the remaining number of minutes of +battery power is less than this value. Default is 10 minutes. + +AC_HD/BATT_HD: + +The idle timeout that should be set on your hard drive when laptop mode +is active (BATT_HD) and when it is not active (AC_HD). The defaults are +20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The +possible values are those listed in the manual page for "hdparm" for the +"-S" option. + +HD: + +The devices for which the spindown timeout should be adjusted by laptop mode. +Default is /dev/hda. If you specify multiple devices, separate them by a space. + +READAHEAD: + +Disk readahead, in 512-byte sectors, while laptop mode is active. A large +readahead can prevent disk accesses for things like executable pages (which are +loaded on demand while the application executes) and sequentially accessed data +(MP3s). + +DO_REMOUNTS: + +The control script automatically remounts any mounted journaled filesystems +with appropriate commit interval options. When this option is set to 0, this +feature is disabled. + +DO_REMOUNT_NOATIME: + +When remounting, should the filesystems be remounted with the noatime option? +Normally, this is set to "1" (enabled), but there may be programs that require +access time recording. + +DIRTY_RATIO: + +The percentage of memory that is allowed to contain "dirty" or unsaved data +before a writeback is forced, while laptop mode is active. Corresponds to +the /proc/sys/vm/dirty_ratio sysctl. + +DIRTY_BACKGROUND_RATIO: + +The percentage of memory that is allowed to contain "dirty" or unsaved data +after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set +this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio +sysctl. + +Note that the behaviour of dirty_background_ratio is quite different +when laptop mode is active and when it isn't. When laptop mode is inactive, +dirty_background_ratio is the threshold percentage at which background writeouts +start taking place. When laptop mode is active, however, background writeouts +are disabled, and the dirty_background_ratio only determines how much writeback +is done when dirty_ratio is reached. + +DO_CPU: + +Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. +See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) + +CPU_MAXFREQ: + +When on battery, what is the maximum CPU speed that the system should use? Legal +values are "slowest" for the slowest speed that your CPU is able to operate at, +or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. + + +Tips & Tricks +------------- + +* Bartek Kania reports getting up to 50 minutes of extra battery life (on top + of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). + +* You can spin down the disk while playing MP3, by setting disk readahead + to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at + once, and will then spin down while the MP3 is playing. (Thanks to Bartek + Kania.) + +* Drew Scott Daniels observed: "I don't know why, but when I decrease the number + of colours that my display uses it consumes less battery power. I've seen + this on powerbooks too. I hope that this is a piece of information that + might be useful to the Laptop Mode patch or its users." + +* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the + file after every logging. When you're using laptop-mode and your disk doesn't + spin down, this is a likely culprit. + +* Richard Atterer observed that laptop mode does not work well with noflushd + (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode + from doing its thing. + +* If you're worried about your data, you might want to consider using a USB + memory stick or something like that as a "working area". (Be aware though + that flash memory can only handle a limited number of writes, and overuse + may wear out your memory stick pretty quickly. Do _not_ use journalling + filesystems on flash memory sticks.) + + +Configuration file for control and ACPI battery scripts +------------------------------------------------------- + +This allows the tunables to be changed for the scripts via an external +configuration file + +It should be installed as /etc/default/laptop-mode on Debian, and as +/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. + +Config file:: + + # Maximum time, in seconds, of hard drive spindown time that you are + # comfortable with. Worst case, it's possible that you could lose this + # amount of work if your battery fails you while in laptop mode. + #MAX_AGE=600 + + # Automatically disable laptop mode when the number of minutes of battery + # that you have left goes below this threshold. + MINIMUM_BATTERY_MINUTES=10 + + # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG + # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk + # will read a complete MP3 at once, and will then spin down while the MP3/OGG is + # playing. + #READAHEAD=4096 + + # Shall we remount journaled fs. with appropriate commit interval? (1=yes) + #DO_REMOUNTS=1 + + # And shall we add the "noatime" option to that as well? (1=yes) + #DO_REMOUNT_NOATIME=1 + + # Dirty synchronous ratio. At this percentage of dirty pages the process + # which + # calls write() does its own writeback + #DIRTY_RATIO=40 + + # + # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been + # exceeded, the kernel will wake flusher threads which will then reduce the + # amount of dirty memory to dirty_background_ratio. Set this nice and low, + # so once some writeout has commenced, we do a lot of it. + # + #DIRTY_BACKGROUND_RATIO=5 + + # kernel default dirty buffer age + #DEF_AGE=30 + #DEF_UPDATE=5 + #DEF_DIRTY_BACKGROUND_RATIO=10 + #DEF_DIRTY_RATIO=40 + #DEF_XFS_AGE_BUFFER=15 + #DEF_XFS_SYNC_INTERVAL=30 + #DEF_XFS_BUFD_INTERVAL=1 + + # This must be adjusted manually to the value of HZ in the running kernel + # on 2.4, until the XFS people change their 2.4 external interfaces to work in + # centisecs. This can be automated, but it's a work in progress that still + # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for + # external interfaces, and that is currently always set to 100. So you don't + # need to change this on 2.6. + #XFS_HZ=100 + + # Should the maximum CPU frequency be adjusted down while on battery? + # Requires CPUFreq to be setup. + # See Documentation/admin-guide/pm/cpufreq.rst for more info + #DO_CPU=0 + + # When on battery what is the maximum CPU speed that the system should + # use? Legal values are "slowest" for the slowest speed that your + # CPU is able to operate at, or a value listed in: + # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies + # Only applicable if DO_CPU=1. + #CPU_MAXFREQ=slowest + + # Idle timeout for your hard drive (man hdparm for valid values, -S option) + # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). + #AC_HD=244 + #BATT_HD=4 + + # The drives for which to adjust the idle timeout. Separate them by a space, + # e.g. HD="/dev/hda /dev/hdb". + #HD="/dev/hda" + + # Set the spindown timeout on a hard drive? + #DO_HD=1 + + +Control script +-------------- + +Please note that this control script works for the Linux 2.4 and 2.6 series (thanks +to Kiko Piris). + +Control script:: + + #!/bin/bash + + # start or stop laptop_mode, best run by a power management daemon when + # ac gets connected/disconnected from a laptop + # + # install as /sbin/laptop_mode + # + # Contributors to this script: Kiko Piris + # Bart Samwel + # Micha Feigin + # Andrew Morton + # Herve Eychenne + # Dax Kelson + # + # Original Linux 2.4 version by: Jens Axboe + + ############################################################################# + + # Source config + if [ -f /etc/default/laptop-mode ] ; then + # Debian + . /etc/default/laptop-mode + elif [ -f /etc/sysconfig/laptop-mode ] ; then + # Others + . /etc/sysconfig/laptop-mode + fi + + # Don't raise an error if the config file is incomplete + # set defaults instead: + + # Maximum time, in seconds, of hard drive spindown time that you are + # comfortable with. Worst case, it's possible that you could lose this + # amount of work if your battery fails you while in laptop mode. + MAX_AGE=${MAX_AGE:-'600'} + + # Read-ahead, in kilobytes + READAHEAD=${READAHEAD:-'4096'} + + # Shall we remount journaled fs. with appropriate commit interval? (1=yes) + DO_REMOUNTS=${DO_REMOUNTS:-'1'} + + # And shall we add the "noatime" option to that as well? (1=yes) + DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} + + # Shall we adjust the idle timeout on a hard drive? + DO_HD=${DO_HD:-'1'} + + # Adjust idle timeout on which hard drive? + HD="${HD:-'/dev/hda'}" + + # spindown time for HD (hdparm -S values) + AC_HD=${AC_HD:-'244'} + BATT_HD=${BATT_HD:-'4'} + + # Dirty synchronous ratio. At this percentage of dirty pages the process which + # calls write() does its own writeback + DIRTY_RATIO=${DIRTY_RATIO:-'40'} + + # cpu frequency scaling + # See Documentation/admin-guide/pm/cpufreq.rst for more info + DO_CPU=${CPU_MANAGE:-'0'} + CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} + + # + # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been + # exceeded, the kernel will wake flusher threads which will then reduce the + # amount of dirty memory to dirty_background_ratio. Set this nice and low, + # so once some writeout has commenced, we do a lot of it. + # + DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} + + # kernel default dirty buffer age + DEF_AGE=${DEF_AGE:-'30'} + DEF_UPDATE=${DEF_UPDATE:-'5'} + DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} + DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} + DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} + DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} + DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} + + # This must be adjusted manually to the value of HZ in the running kernel + # on 2.4, until the XFS people change their 2.4 external interfaces to work in + # centisecs. This can be automated, but it's a work in progress that still needs + # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external + # interfaces, and that is currently always set to 100. So you don't need to + # change this on 2.6. + XFS_HZ=${XFS_HZ:-'100'} + + ############################################################################# + + KLEVEL="$(uname -r | + { + IFS='.' read a b c + echo $a.$b + } + )" + case "$KLEVEL" in + "2.4"|"2.6") + ;; + *) + echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 + exit 1 + ;; + esac + + if [ ! -e /proc/sys/vm/laptop_mode ] ; then + echo "Kernel is not patched with laptop_mode patch." >&2 + exit 1 + fi + + if [ ! -w /proc/sys/vm/laptop_mode ] ; then + echo "You do not have enough privileges to enable laptop_mode." >&2 + exit 1 + fi + + # Remove an option (the first parameter) of the form option= from + # a mount options string (the rest of the parameters). + parse_mount_opts () { + OPT="$1" + shift + echo ",$*," | sed \ + -e 's/,'"$OPT"'=[0-9]*,/,/g' \ + -e 's/,,*/,/g' \ + -e 's/^,//' \ + -e 's/,$//' + } + + # Remove an option (the first parameter) without any arguments from + # a mount option string (the rest of the parameters). + parse_nonumber_mount_opts () { + OPT="$1" + shift + echo ",$*," | sed \ + -e 's/,'"$OPT"',/,/g' \ + -e 's/,,*/,/g' \ + -e 's/^,//' \ + -e 's/,$//' + } + + # Find out the state of a yes/no option (e.g. "atime"/"noatime") in + # fstab for a given filesystem, and use this state to replace the + # value of the option in another mount options string. The device + # is the first argument, the option name the second, and the default + # value the third. The remainder is the mount options string. + # + # Example: + # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime + # + # If fstab contains, say, "rw" for this filesystem, then the result + # will be "defaults,atime". + parse_yesno_opts_wfstab () { + L_DEV="$1" + OPT="$2" + DEF_OPT="$3" + shift 3 + L_OPTS="$*" + PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" + PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" + # Watch for a default atime in fstab + FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" + if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then + # option specified in fstab: extract the value and use it + if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then + echo "$PARSEDOPTS1,no$OPT" + else + # no$OPT not found -- so we must have $OPT. + echo "$PARSEDOPTS1,$OPT" + fi + else + # option not specified in fstab -- choose the default. + echo "$PARSEDOPTS1,$DEF_OPT" + fi + } + + # Find out the state of a numbered option (e.g. "commit=NNN") in + # fstab for a given filesystem, and use this state to replace the + # value of the option in another mount options string. The device + # is the first argument, and the option name the second. The + # remainder is the mount options string in which the replacement + # must be done. + # + # Example: + # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 + # + # If fstab contains, say, "commit=3,rw" for this filesystem, then the + # result will be "rw,commit=3". + parse_mount_opts_wfstab () { + L_DEV="$1" + OPT="$2" + shift 2 + L_OPTS="$*" + PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" + # Watch for a default commit in fstab + FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" + if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then + # option specified in fstab: extract the value, and use it + echo -n "$PARSEDOPTS1,$OPT=" + echo ",$FSTAB_OPTS," | sed \ + -e 's/.*,'"$OPT"'=//' \ + -e 's/,.*//' + else + # option not specified in fstab: set it to 0 + echo "$PARSEDOPTS1,$OPT=0" + fi + } + + deduce_fstype () { + MP="$1" + # My root filesystem unfortunately has + # type "unknown" in /etc/mtab. If we encounter + # "unknown", we try to get the type from fstab. + cat /etc/fstab | + grep -v '^#' | + while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do + if [ "$FSTAB_MP" = "$MP" ]; then + echo $FSTAB_FST + exit 0 + fi + done + } + + if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then + NOATIME_OPT=",noatime" + fi + + case "$1" in + start) + AGE=$((100*$MAX_AGE)) + XFS_AGE=$(($XFS_HZ*$MAX_AGE)) + echo -n "Starting laptop_mode" + + if [ -d /proc/sys/vm/pagebuf ] ; then + # (For 2.4 and early 2.6.) + # This only needs to be set, not reset -- it is only used when + # laptop mode is enabled. + echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age + echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval + elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then + # (A couple of early 2.6 laptop mode patches had these.) + # The same goes for these. + echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer + echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval + elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then + # (2.6.6) + # But not for these -- they are also used in normal + # operation. + echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer + echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval + elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then + # (2.6.7 upwards) + # And not for these either. These are in centisecs, + # not USER_HZ, so we have to use $AGE, not $XFS_AGE. + echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs + echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs + echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs + fi + + case "$KLEVEL" in + "2.4") + echo 1 > /proc/sys/vm/laptop_mode + echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo 5 > /proc/sys/vm/laptop_mode + echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ]; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + PARSEDOPTS="$(parse_mount_opts "$OPTS")" + if [ "$FST" = 'unknown' ]; then + FST=$(deduce_fstype $MP) + fi + case "$FST" in + "ext3"|"reiserfs") + PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT + ;; + "xfs") + mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT + ;; + esac + if [ -b $DEV ] ; then + blockdev --setra $(($READAHEAD * 2)) $DEV + fi + done + fi + if [ $DO_HD -eq 1 ] ; then + for THISHD in $HD ; do + /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 + /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 + done + fi + if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then + if [ $CPU_MAXFREQ = 'slowest' ]; then + CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` + fi + echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq + fi + echo "." + ;; + stop) + U_AGE=$((100*$DEF_UPDATE)) + B_AGE=$((100*$DEF_AGE)) + echo -n "Stopping laptop_mode" + echo 0 > /proc/sys/vm/laptop_mode + if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then + # These need to be restored, if there are no lm_*. + echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer + echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval + elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then + # These need to be restored as well. + echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs + echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs + echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs + fi + case "$KLEVEL" in + "2.4") + echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ] ; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + # Reset commit and atime options to defaults. + if [ "$FST" = 'unknown' ]; then + FST=$(deduce_fstype $MP) + fi + case "$FST" in + "ext3"|"reiserfs") + PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" + PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS + ;; + "xfs") + PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS + ;; + esac + if [ -b $DEV ] ; then + blockdev --setra 256 $DEV + fi + done + fi + if [ $DO_HD -eq 1 ] ; then + for THISHD in $HD ; do + /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 + /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 + done + fi + if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then + echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq + fi + echo "." + ;; + *) + echo "Usage: $0 {start|stop}" 2>&1 + exit 1 + ;; + + esac + + exit 0 + + +ACPI integration +---------------- + +Dax Kelson submitted this so that the ACPI acpid daemon will +kick off the laptop_mode script and run hdparm. The part that +automatically disables laptop mode when the battery is low was +written by Jan Topinski. + +/etc/acpi/events/ac_adapter:: + + event=ac_adapter + action=/etc/acpi/actions/ac.sh %e + +/etc/acpi/events/battery:: + + event=battery.* + action=/etc/acpi/actions/battery.sh %e + +/etc/acpi/actions/ac.sh:: + + #!/bin/bash + + # ac on/offline event handler + + status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` + + case $status in + "on-line") + /sbin/laptop_mode stop + exit 0 + ;; + "off-line") + /sbin/laptop_mode start + exit 0 + ;; + esac + + +/etc/acpi/actions/battery.sh:: + + #! /bin/bash + + # Automatically disable laptop mode when the battery almost runs out. + + BATT_INFO=/proc/acpi/battery/$2/state + + if [[ -f /proc/sys/vm/laptop_mode ]] + then + LM=`cat /proc/sys/vm/laptop_mode` + if [[ $LM -gt 0 ]] + then + if [[ -f $BATT_INFO ]] + then + # Source the config file only now that we know we need + if [ -f /etc/default/laptop-mode ] ; then + # Debian + . /etc/default/laptop-mode + elif [ -f /etc/sysconfig/laptop-mode ] ; then + # Others + . /etc/sysconfig/laptop-mode + fi + MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} + + ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" + if [[ ACTION -eq "discharging" ]] + then + PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` + REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` + fi + if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) + then + /sbin/laptop_mode stop + fi + else + logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." + fi + fi + fi + + +Monitoring tool +--------------- + +Bartek Kania submitted this, it can be used to measure how much time your disk +spends spun up/down. See tools/laptop/dslm/dslm.c diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst new file mode 100644 index 000000000000..ce9b14671cb9 --- /dev/null +++ b/Documentation/admin-guide/laptops/lg-laptop.rst @@ -0,0 +1,84 @@ +.. SPDX-License-Identifier: GPL-2.0+ + + +LG Gram laptop extra features +============================= + +By Matan Ziv-Av + + +Hotkeys +------- + +The following FN keys are ignored by the kernel without this driver: + +- FN-F1 (LG control panel) - Generates F15 +- FN-F5 (Touchpad toggle) - Generates F13 +- FN-F6 (Airplane mode) - Generates RFKILL +- FN-F8 (Keyboard backlight) - Generates F16. + This key also changes keyboard backlight mode. +- FN-F9 (Reader mode) - Generates F14 + +The rest of the FN keys work without a need for a special driver. + + +Reader mode +----------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/reader_mode disables/enables +reader mode. In this mode the screen colors change (blue color reduced), +and the reader mode indicator LED (on F9 key) turns on. + + +FN Lock +------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/fn_lock disables/enables +FN lock. + + +Battery care limit +------------------ + +Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit +sets the maximum capacity to charge the battery. Limiting the charge +reduces battery capacity loss over time. + +This value is reset to 100 when the kernel boots. + + +Fan mode +-------- + +Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables +the fan silent mode. + + +USB charge +---------- + +Writing 0/1 to /sys/devices/platform/lg-laptop/usb_charge disables/enables +charging another device from the USB port while the device is turned off. + +This value is reset to 0 when the kernel boots. + + +LEDs +~~~~ + +The are two LED devices supported by the driver: + +Keyboard backlight +------------------ + +A led device named kbd_led controls the keyboard backlight. There are three +lighting level: off (0), low (127) and high (255). + +The keyboard backlight is also controlled by the key combination FN-F8 +which cycles through those levels. + + +Touchpad indicator LED +---------------------- + +On the F5 key. Controlled by led device names tpad_led. diff --git a/Documentation/admin-guide/laptops/sony-laptop.rst b/Documentation/admin-guide/laptops/sony-laptop.rst new file mode 100644 index 000000000000..9edcc7f6612f --- /dev/null +++ b/Documentation/admin-guide/laptops/sony-laptop.rst @@ -0,0 +1,174 @@ +========================================= +Sony Notebook Control Driver (SNC) Readme +========================================= + + - Copyright (C) 2004- 2005 Stelian Pop + - Copyright (C) 2007 Mattia Dongili + +This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the +Sony Vaio laptops. This driver mixes both devices functions under the same +(hopefully consistent) interface. This also means that the sonypi driver is +obsoleted by sony-laptop now. + +Fn keys (hotkeys): +------------------ + +Some models report hotkeys through the SNC or SPIC devices, such events are +reported both through the ACPI subsystem as acpi events and through the INPUT +subsystem. See the logs of /proc/bus/input/devices to find out what those +events are and which input devices are created by the driver. +Additionally, loading the driver with the debug option will report all events +in the kernel log. + +The "scancodes" passed to the input system (that can be remapped with udev) +are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c +module. For example the "FN/E" key combination (EJECTCD on some models) +generates the scancode 20 (0x14). + +Backlight control: +------------------ +If your laptop model supports it, you will find sysfs files in the +/sys/class/backlight/sony/ +directory. You will be able to query and set the current screen +brightness: + + ====================== ========================================= + brightness get/set screen brightness (an integer + between 0 and 7) + actual_brightness reading from this file will query the HW + to get real brightness value + max_brightness the maximum brightness value + ====================== ========================================= + + +Platform specific: +------------------ +Loading the sony-laptop module will create a +/sys/devices/platform/sony-laptop/ +directory populated with some files. + +You then read/write integer values from/to those files by using +standard UNIX tools. + +The files are: + + ====================== ========================================== + brightness_default screen brightness which will be set + when the laptop will be rebooted + cdpower power on/off the internal CD drive + audiopower power on/off the internal sound card + lanpower power on/off the internal ethernet card + (only in debug mode) + bluetoothpower power on/off the internal bluetooth device + fanspeed get/set the fan speed + ====================== ========================================== + +Note that some files may be missing if they are not supported +by your particular laptop model. + +Example usage:: + + # echo "1" > /sys/devices/platform/sony-laptop/brightness_default + +sets the lowest screen brightness for the next and later reboots + +:: + + # echo "8" > /sys/devices/platform/sony-laptop/brightness_default + +sets the highest screen brightness for the next and later reboots + +:: + + # cat /sys/devices/platform/sony-laptop/brightness_default + +retrieves the value + +:: + + # echo "0" > /sys/devices/platform/sony-laptop/audiopower + +powers off the sound card + +:: + + # echo "1" > /sys/devices/platform/sony-laptop/audiopower + +powers on the sound card. + + +RFkill control: +--------------- +More recent Vaio models expose a consistent set of ACPI methods to +control radio frequency emitting devices. If you are a lucky owner of +such a laptop you will find the necessary rfkill devices under +/sys/class/rfkill. Check those starting with sony-* in:: + + # grep . /sys/class/rfkill/*/{state,name} + + +Development: +------------ + +If you want to help with the development of this driver (and +you are not afraid of any side effects doing strange things with +your ACPI BIOS could have on your laptop), load the driver and +pass the option 'debug=1'. + +REPEAT: + **DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.** + +In your kernel logs you will find the list of all ACPI methods +the SNC device has on your laptop. + +* For new models you will see a long list of meaningless method names, + reading the DSDT table source should reveal that: + +(1) the SNC device uses an internal capability lookup table +(2) SN00 is used to find values in the lookup table +(3) SN06 and SN07 are used to call into the real methods based on + offsets you can obtain iterating the table using SN00 +(4) SN02 used to enable events. + +Some values in the capability lookup table are more or less known, see +the code for all sony_call_snc_handle calls, others are more obscure. + +* For old models you can see the GCDP/GCDP methods used to pwer on/off + the CD drive, but there are others and they are usually different from + model to model. + +**I HAVE NO IDEA WHAT THOSE METHODS DO.** + +The sony-laptop driver creates, for some of those methods (the most +current ones found on several Vaio models), an entry under +/sys/devices/platform/sony-laptop, just like the 'cdpower' one. +You can create other entries corresponding to your own laptop methods by +further editing the source (see the 'sony_nc_values' table, and add a new +entry to this table with your get/set method names using the +SNC_HANDLE_NAMES macro). + +Your mission, should you accept it, is to try finding out what +those entries are for, by reading/writing random values from/to those +files and find out what is the impact on your laptop. + +Should you find anything interesting, please report it back to me, +I will not disavow all knowledge of your actions :) + +See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other +useful info. + +Bugs/Limitations: +----------------- + +* This driver is not based on official documentation from Sony + (because there is none), so there is no guarantee this driver + will work at all, or do the right thing. Although this hasn't + happened to me, this driver could do very bad things to your + laptop, including permanent damage. + +* The sony-laptop and sonypi drivers do not interact at all. In the + future, sonypi will be removed and replaced by sony-laptop. + +* spicctrl, which is the userspace tool used to communicate with the + sonypi driver (through /dev/sonypi) is deprecated as well since all + its features are now available under the sysfs tree via sony-laptop. diff --git a/Documentation/admin-guide/laptops/sonypi.rst b/Documentation/admin-guide/laptops/sonypi.rst new file mode 100644 index 000000000000..2a1975ed7ee4 --- /dev/null +++ b/Documentation/admin-guide/laptops/sonypi.rst @@ -0,0 +1,160 @@ +================================================== +Sony Programmable I/O Control Device Driver Readme +================================================== + + - Copyright (C) 2001-2004 Stelian Pop + - Copyright (C) 2001-2002 Alcôve + - Copyright (C) 2001 Michael Ashley + - Copyright (C) 2001 Junichi Morita + - Copyright (C) 2000 Takaya Kinjo + - Copyright (C) 2000 Andrew Tridgell + +This driver enables access to the Sony Programmable I/O Control Device which +can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be +limited to new FX series laptops, at least the FX501 and the FX702) lack a +sonypi device and are not supported at all by this driver. + +It will give access (through a user space utility) to some events those laptops +generate, like: + + - jogdial events (the small wheel on the side of Vaios) + - capture button events (only on Vaio Picturebook series) + - Fn keys + - bluetooth button (only on C1VR model) + - programmable keys, back, help, zoom, thumbphrase buttons, etc. + (when available) + +Those events (see linux/sonypi.h) can be polled using the character device node +/dev/sonypi (major 10, minor auto allocated or specified as a option). +A simple daemon which translates the jogdial movements into mouse wheel events +can be downloaded at: + +Another option to intercept the events is to get them directly through the +input layer. + +This driver supports also some ioctl commands for setting the LCD screen +brightness and querying the batteries charge information (some more +commands may be added in the future). + +This driver can also be used to set the camera controls on Picturebook series +(brightness, contrast etc), and is used by the video4linux driver for the +Motion Eye camera. + +Please note that this driver was created by reverse engineering the Windows +driver and the ACPI BIOS, because Sony doesn't agree to release any programming +specs for its laptops. If someone convinces them to do so, drop me a note. + +Driver options: +--------------- + +Several options can be passed to the sonypi driver using the standard +module argument syntax (= when passing the option to the +module or sonypi.= on the kernel boot line when sonypi is +statically linked into the kernel). Those options are: + + =============== ======================================================= + minor: minor number of the misc device /dev/sonypi, + default is -1 (automatic allocation, see /proc/misc + or kernel logs) + + camera: if you have a PictureBook series Vaio (with the + integrated MotionEye camera), set this parameter to 1 + in order to let the driver access to the camera + + fnkeyinit: on some Vaios (C1VE, C1VR etc), the Fn key events don't + get enabled unless you set this parameter to 1. + Do not use this option unless it's actually necessary, + some Vaio models don't deal well with this option. + This option is available only if the kernel is + compiled without ACPI support (since it conflicts + with it and it shouldn't be required anyway if + ACPI is already enabled). + + verbose: set to 1 to print unknown events received from the + sonypi device. + set to 2 to print all events received from the + sonypi device. + + compat: uses some compatibility code for enabling the sonypi + events. If the driver worked for you in the past + (prior to version 1.5) and does not work anymore, + add this option and report to the author. + + mask: event mask telling the driver what events will be + reported to the user. This parameter is required for + some Vaio models where the hardware reuses values + used in other Vaio models (like the FX series who does + not have a jogdial but reuses the jogdial events for + programmable keys events). The default event mask is + set to 0xffffffff, meaning that all possible events + will be tried. You can use the following bits to + construct your own event mask (from + drivers/char/sonypi.h): + + ======================== ====== + SONYPI_JOGGER_MASK 0x0001 + SONYPI_CAPTURE_MASK 0x0002 + SONYPI_FNKEY_MASK 0x0004 + SONYPI_BLUETOOTH_MASK 0x0008 + SONYPI_PKEY_MASK 0x0010 + SONYPI_BACK_MASK 0x0020 + SONYPI_HELP_MASK 0x0040 + SONYPI_LID_MASK 0x0080 + SONYPI_ZOOM_MASK 0x0100 + SONYPI_THUMBPHRASE_MASK 0x0200 + SONYPI_MEYE_MASK 0x0400 + SONYPI_MEMORYSTICK_MASK 0x0800 + SONYPI_BATTERY_MASK 0x1000 + SONYPI_WIRELESS_MASK 0x2000 + ======================== ====== + + useinput: if set (which is the default) two input devices are + created, one which interprets the jogdial events as + mouse events, the other one which acts like a + keyboard reporting the pressing of the special keys. + =============== ======================================================= + +Module use: +----------- + +In order to automatically load the sonypi module on use, you can put those +lines a configuration file in /etc/modprobe.d/:: + + alias char-major-10-250 sonypi + options sonypi minor=250 + +This supposes the use of minor 250 for the sonypi device:: + + # mknod /dev/sonypi c 10 250 + +Bugs: +----- + + - several users reported that this driver disables the BIOS-managed + Fn-keys which put the laptop in sleeping state, or switch the + external monitor on/off. There is no workaround yet, since this + driver disables all APM management for those keys, by enabling the + ACPI management (and the ACPI core stuff is not complete yet). If + you have one of those laptops with working Fn keys and want to + continue to use them, don't use this driver. + + - some users reported that the laptop speed is lower (dhrystone + tested) when using the driver with the fnkeyinit parameter. I cannot + reproduce it on my laptop and not all users have this problem. + This happens because the fnkeyinit parameter enables the ACPI + mode (but without additional ACPI control, like processor + speed handling etc). Use ACPI instead of APM if it works on your + laptop. + + - sonypi lacks the ability to distinguish between certain key + events on some models. + + - some models with the nvidia card (geforce go 6200 tc) uses a + different way to adjust the backlighting of the screen. There + is a userspace utility to adjust the brightness on those models, + which can be downloaded from + http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2 + + - since all development was done by reverse engineering, there is + *absolutely no guarantee* that this driver will not crash your + laptop. Permanently. diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst new file mode 100644 index 000000000000..19d52fc3c5e9 --- /dev/null +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -0,0 +1,1562 @@ +=========================== +ThinkPad ACPI Extras Driver +=========================== + +Version 0.25 + +October 16th, 2013 + +- Borislav Deianov +- Henrique de Moraes Holschuh + +http://ibm-acpi.sf.net/ + +This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It +supports various features of these laptops which are accessible +through the ACPI and ACPI EC framework, but not otherwise fully +supported by the generic Linux ACPI drivers. + +This driver used to be named ibm-acpi until kernel 2.6.21 and release +0.13-20070314. It used to be in the drivers/acpi tree, but it was +moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel +2.6.22, and release 0.14. It was moved to drivers/platform/x86 for +kernel 2.6.29 and release 0.22. + +The driver is named "thinkpad-acpi". In some places, like module +names and log messages, "thinkpad_acpi" is used because of userspace +issues. + +"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too +long due to length limitations on some Linux kernel versions. + +Status +------ + +The features currently supported are the following (see below for +detailed description): + + - Fn key combinations + - Bluetooth enable and disable + - video output switching, expansion control + - ThinkLight on and off + - CMOS/UCMS control + - LED control + - ACPI sounds + - temperature sensors + - Experimental: embedded controller register dump + - LCD brightness control + - Volume control + - Fan control and monitoring: fan speed, fan enable/disable + - WAN enable and disable + - UWB enable and disable + +A compatibility table by model and feature is maintained on the web +site, http://ibm-acpi.sf.net/. I appreciate any success or failure +reports, especially if they add to or correct the compatibility table. +Please include the following information in your report: + + - ThinkPad model name + - a copy of your ACPI tables, using the "acpidump" utility + - a copy of the output of dmidecode, with serial numbers + and UUIDs masked off + - which driver features work and which don't + - the observed behavior of non-working features + +Any other comments or patches are also more than welcome. + + +Installation +------------ + +If you are compiling this driver as included in the Linux kernel +sources, look for the CONFIG_THINKPAD_ACPI Kconfig option. +It is located on the menu path: "Device Drivers" -> "X86 Platform +Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras". + + +Features +-------- + +The driver exports two different interfaces to userspace, which can be +used to access the features it provides. One is a legacy procfs-based +interface, which will be removed at some time in the future. The other +is a new sysfs-based interface which is not complete yet. + +The procfs interface creates the /proc/acpi/ibm directory. There is a +file under that directory for each feature it supports. The procfs +interface is mostly frozen, and will change very little if at all: it +will not be extended to add any new functionality in the driver, instead +all new functionality will be implemented on the sysfs interface. + +The sysfs interface tries to blend in the generic Linux sysfs subsystems +and classes as much as possible. Since some of these subsystems are not +yet ready or stabilized, it is expected that this interface will change, +and any and all userspace programs must deal with it. + + +Notes about the sysfs interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Unlike what was done with the procfs interface, correctness when talking +to the sysfs interfaces will be enforced, as will correctness in the +thinkpad-acpi's implementation of sysfs interfaces. + +Also, any bugs in the thinkpad-acpi sysfs driver code or in the +thinkpad-acpi's implementation of the sysfs interfaces will be fixed for +maximum correctness, even if that means changing an interface in +non-compatible ways. As these interfaces mature both in the kernel and +in thinkpad-acpi, such changes should become quite rare. + +Applications interfacing to the thinkpad-acpi sysfs interfaces must +follow all sysfs guidelines and correctly process all errors (the sysfs +interface makes extensive use of errors). File descriptors and open / +close operations to the sysfs inodes must also be properly implemented. + +The version of thinkpad-acpi's sysfs interface is exported by the driver +as a driver attribute (see below). + +Sysfs driver attributes are on the driver's sysfs attribute space, +for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and +/sys/bus/platform/drivers/thinkpad_hwmon/ + +Sysfs device attributes are on the thinkpad_acpi device sysfs attribute +space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/. + +Sysfs device attributes for the sensors and fan are on the +thinkpad_hwmon device's sysfs attribute space, but you should locate it +looking for a hwmon device with the name attribute of "thinkpad", or +better yet, through libsensors. For 4.14+ sysfs attributes were moved to the +hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or +/sys/class/hwmon/hwmon?). + +Driver version +-------------- + +procfs: /proc/acpi/ibm/driver + +sysfs driver attribute: version + +The driver name and version. No commands can be written to this file. + + +Sysfs interface version +----------------------- + +sysfs driver attribute: interface_version + +Version of the thinkpad-acpi sysfs interface, as an unsigned long +(output in hex format: 0xAAAABBCC), where: + + AAAA + - major revision + BB + - minor revision + CC + - bugfix revision + +The sysfs interface version changelog for the driver can be found at the +end of this document. Changes to the sysfs interface done by the kernel +subsystems are not documented here, nor are they tracked by this +attribute. + +Changes to the thinkpad-acpi sysfs interface are only considered +non-experimental when they are submitted to Linux mainline, at which +point the changes in this interface are documented and interface_version +may be updated. If you are using any thinkpad-acpi features not yet +sent to mainline for merging, you do so on your own risk: these features +may disappear, or be implemented in a different and incompatible way by +the time they are merged in Linux mainline. + +Changes that are backwards-compatible by nature (e.g. the addition of +attributes that do not change the way the other attributes work) do not +always warrant an update of interface_version. Therefore, one must +expect that an attribute might not be there, and deal with it properly +(an attribute not being there *is* a valid way to make it clear that a +feature is not available in sysfs). + + +Hot keys +-------- + +procfs: /proc/acpi/ibm/hotkey + +sysfs device attribute: hotkey_* + +In a ThinkPad, the ACPI HKEY handler is responsible for communicating +some important events and also keyboard hot key presses to the operating +system. Enabling the hotkey functionality of thinkpad-acpi signals the +firmware that such a driver is present, and modifies how the ThinkPad +firmware will behave in many situations. + +The driver enables the HKEY ("hot key") event reporting automatically +when loaded, and disables it when it is removed. + +The driver will report HKEY events in the following format:: + + ibm/hotkey HKEY 00000080 0000xxxx + +Some of these events refer to hot key presses, but not all of them. + +The driver will generate events over the input layer for hot keys and +radio switches, and over the ACPI netlink layer for other events. The +input layer support accepts the standard IOCTLs to remap the keycodes +assigned to each hot key. + +The hot key bit mask allows some control over which hot keys generate +events. If a key is "masked" (bit set to 0 in the mask), the firmware +will handle it. If it is "unmasked", it signals the firmware that +thinkpad-acpi would prefer to handle it, if the firmware would be so +kind to allow it (and it often doesn't!). + +Not all bits in the mask can be modified. Not all bits that can be +modified do anything. Not all hot keys can be individually controlled +by the mask. Some models do not support the mask at all. The behaviour +of the mask is, therefore, highly dependent on the ThinkPad model. + +The driver will filter out any unmasked hotkeys, so even if the firmware +doesn't allow disabling an specific hotkey, the driver will not report +events for unmasked hotkeys. + +Note that unmasking some keys prevents their default behavior. For +example, if Fn+F5 is unmasked, that key will no longer enable/disable +Bluetooth by itself in firmware. + +Note also that not all Fn key combinations are supported through ACPI +depending on the ThinkPad model and firmware version. On those +ThinkPads, it is still possible to support some extra hotkeys by +polling the "CMOS NVRAM" at least 10 times per second. The driver +attempts to enables this functionality automatically when required. + +procfs notes +^^^^^^^^^^^^ + +The following commands can be written to the /proc/acpi/ibm/hotkey file:: + + echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys + echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys + ... any other 8-hex-digit mask ... + echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask + +The following commands have been deprecated and will cause the kernel +to log a warning:: + + echo enable > /proc/acpi/ibm/hotkey -- does nothing + echo disable > /proc/acpi/ibm/hotkey -- returns an error + +The procfs interface does not support NVRAM polling control. So as to +maintain maximum bug-to-bug compatibility, it does not report any masks, +nor does it allow one to manipulate the hot key mask when the firmware +does not support masks at all, even if NVRAM polling is in use. + +sysfs notes +^^^^^^^^^^^ + + hotkey_bios_enabled: + DEPRECATED, WILL BE REMOVED SOON. + + Returns 0. + + hotkey_bios_mask: + DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE. + + Returns the hot keys mask when thinkpad-acpi was loaded. + Upon module unload, the hot keys mask will be restored + to this value. This is always 0x80c, because those are + the hotkeys that were supported by ancient firmware + without mask support. + + hotkey_enable: + DEPRECATED, WILL BE REMOVED SOON. + + 0: returns -EPERM + 1: does nothing + + hotkey_mask: + bit mask to enable reporting (and depending on + the firmware, ACPI event generation) for each hot key + (see above). Returns the current status of the hot keys + mask, and allows one to modify it. + + hotkey_all_mask: + bit mask that should enable event reporting for all + supported hot keys, when echoed to hotkey_mask above. + Unless you know which events need to be handled + passively (because the firmware *will* handle them + anyway), do *not* use hotkey_all_mask. Use + hotkey_recommended_mask, instead. You have been warned. + + hotkey_recommended_mask: + bit mask that should enable event reporting for all + supported hot keys, except those which are always + handled by the firmware anyway. Echo it to + hotkey_mask above, to use. This is the default mask + used by the driver. + + hotkey_source_mask: + bit mask that selects which hot keys will the driver + poll the NVRAM for. This is auto-detected by the driver + based on the capabilities reported by the ACPI firmware, + but it can be overridden at runtime. + + Hot keys whose bits are set in hotkey_source_mask are + polled for in NVRAM, and reported as hotkey events if + enabled in hotkey_mask. Only a few hot keys are + available through CMOS NVRAM polling. + + Warning: when in NVRAM mode, the volume up/down/mute + keys are synthesized according to changes in the mixer, + which uses a single volume up or volume down hotkey + press to unmute, as per the ThinkPad volume mixer user + interface. When in ACPI event mode, volume up/down/mute + events are reported by the firmware and can behave + differently (and that behaviour changes with firmware + version -- not just with firmware models -- as well as + OSI(Linux) state). + + hotkey_poll_freq: + frequency in Hz for hot key polling. It must be between + 0 and 25 Hz. Polling is only carried out when strictly + needed. + + Setting hotkey_poll_freq to zero disables polling, and + will cause hot key presses that require NVRAM polling + to never be reported. + + Setting hotkey_poll_freq too low may cause repeated + pressings of the same hot key to be misreported as a + single key press, or to not even be detected at all. + The recommended polling frequency is 10Hz. + + hotkey_radio_sw: + If the ThinkPad has a hardware radio switch, this + attribute will read 0 if the switch is in the "radios + disabled" position, and 1 if the switch is in the + "radios enabled" position. + + This attribute has poll()/select() support. + + hotkey_tablet_mode: + If the ThinkPad has tablet capabilities, this attribute + will read 0 if the ThinkPad is in normal mode, and + 1 if the ThinkPad is in tablet mode. + + This attribute has poll()/select() support. + + wakeup_reason: + Set to 1 if the system is waking up because the user + requested a bay ejection. Set to 2 if the system is + waking up because the user requested the system to + undock. Set to zero for normal wake-ups or wake-ups + due to unknown reasons. + + This attribute has poll()/select() support. + + wakeup_hotunplug_complete: + Set to 1 if the system was waken up because of an + undock or bay ejection request, and that request + was successfully completed. At this point, it might + be useful to send the system back to sleep, at the + user's choice. Refer to HKEY events 0x4003 and + 0x3003, below. + + This attribute has poll()/select() support. + +input layer notes +^^^^^^^^^^^^^^^^^ + +A Hot key is mapped to a single input layer EV_KEY event, possibly +followed by an EV_MSC MSC_SCAN event that shall contain that key's scan +code. An EV_SYN event will always be generated to mark the end of the +event block. + +Do not use the EV_MSC MSC_SCAN events to process keys. They are to be +used as a helper to remap keys, only. They are particularly useful when +remapping KEY_UNKNOWN keys. + +The events are available in an input device, with the following id: + + ============== ============================== + Bus BUS_HOST + vendor 0x1014 (PCI_VENDOR_ID_IBM) or + 0x17aa (PCI_VENDOR_ID_LENOVO) + product 0x5054 ("TP") + version 0x4101 + ============== ============================== + +The version will have its LSB incremented if the keymap changes in a +backwards-compatible way. The MSB shall always be 0x41 for this input +device. If the MSB is not 0x41, do not use the device as described in +this section, as it is either something else (e.g. another input device +exported by a thinkpad driver, such as HDAPS) or its functionality has +been changed in a non-backwards compatible way. + +Adding other event types for other functionalities shall be considered a +backwards-compatible change for this input device. + +Thinkpad-acpi Hot Key event map (version 0x4101): + +======= ======= ============== ============================================== +ACPI Scan +event code Key Notes +======= ======= ============== ============================================== +0x1001 0x00 FN+F1 - + +0x1002 0x01 FN+F2 IBM: battery (rare) + Lenovo: Screen lock + +0x1003 0x02 FN+F3 Many IBM models always report + this hot key, even with hot keys + disabled or with Fn+F3 masked + off + IBM: screen lock, often turns + off the ThinkLight as side-effect + Lenovo: battery + +0x1004 0x03 FN+F4 Sleep button (ACPI sleep button + semantics, i.e. sleep-to-RAM). + It always generates some kind + of event, either the hot key + event or an ACPI sleep button + event. The firmware may + refuse to generate further FN+F4 + key presses until a S3 or S4 ACPI + sleep cycle is performed or some + time passes. + +0x1005 0x04 FN+F5 Radio. Enables/disables + the internal Bluetooth hardware + and W-WAN card if left in control + of the firmware. Does not affect + the WLAN card. + Should be used to turn on/off all + radios (Bluetooth+W-WAN+WLAN), + really. + +0x1006 0x05 FN+F6 - + +0x1007 0x06 FN+F7 Video output cycle. + Do you feel lucky today? + +0x1008 0x07 FN+F8 IBM: toggle screen expand + Lenovo: configure UltraNav, + or toggle screen expand + +0x1009 0x08 FN+F9 - + +... ... ... ... + +0x100B 0x0A FN+F11 - + +0x100C 0x0B FN+F12 Sleep to disk. You are always + supposed to handle it yourself, + either through the ACPI event, + or through a hotkey event. + The firmware may refuse to + generate further FN+F12 key + press events until a S3 or S4 + ACPI sleep cycle is performed, + or some time passes. + +0x100D 0x0C FN+BACKSPACE - +0x100E 0x0D FN+INSERT - +0x100F 0x0E FN+DELETE - + +0x1010 0x0F FN+HOME Brightness up. This key is + always handled by the firmware + in IBM ThinkPads, even when + unmasked. Just leave it alone. + For Lenovo ThinkPads with a new + BIOS, it has to be handled either + by the ACPI OSI, or by userspace. + The driver does the right thing, + never mess with this. +0x1011 0x10 FN+END Brightness down. See brightness + up for details. + +0x1012 0x11 FN+PGUP ThinkLight toggle. This key is + always handled by the firmware, + even when unmasked. + +0x1013 0x12 FN+PGDOWN - + +0x1014 0x13 FN+SPACE Zoom key + +0x1015 0x14 VOLUME UP Internal mixer volume up. This + key is always handled by the + firmware, even when unmasked. + NOTE: Lenovo seems to be changing + this. +0x1016 0x15 VOLUME DOWN Internal mixer volume up. This + key is always handled by the + firmware, even when unmasked. + NOTE: Lenovo seems to be changing + this. +0x1017 0x16 MUTE Mute internal mixer. This + key is always handled by the + firmware, even when unmasked. + +0x1018 0x17 THINKPAD ThinkPad/Access IBM/Lenovo key + +0x1019 0x18 unknown + +... ... ... + +0x1020 0x1F unknown +======= ======= ============== ============================================== + +The ThinkPad firmware does not allow one to differentiate when most hot +keys are pressed or released (either that, or we don't know how to, yet). +For these keys, the driver generates a set of events for a key press and +immediately issues the same set of events for a key release. It is +unknown by the driver if the ThinkPad firmware triggered these events on +hot key press or release, but the firmware will do it for either one, not +both. + +If a key is mapped to KEY_RESERVED, it generates no input events at all. +If a key is mapped to KEY_UNKNOWN, it generates an input event that +includes an scan code. If a key is mapped to anything else, it will +generate input device EV_KEY events. + +In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW +events for switches: + +============== ============================================== +SW_RFKILL_ALL T60 and later hardware rfkill rocker switch +SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A +============== ============================================== + +Non hotkey ACPI HKEY event map +------------------------------ + +Events that are never propagated by the driver: + +====== ================================================== +0x2304 System is waking up from suspend to undock +0x2305 System is waking up from suspend to eject bay +0x2404 System is waking up from hibernation to undock +0x2405 System is waking up from hibernation to eject bay +0x5001 Lid closed +0x5002 Lid opened +0x5009 Tablet swivel: switched to tablet mode +0x500A Tablet swivel: switched to normal mode +0x5010 Brightness level changed/control event +0x6000 KEYBOARD: Numlock key pressed +0x6005 KEYBOARD: Fn key pressed (TO BE VERIFIED) +0x7000 Radio Switch may have changed state +====== ================================================== + + +Events that are propagated by the driver to userspace: + +====== ===================================================== +0x2313 ALARM: System is waking up from suspend because + the battery is nearly empty +0x2413 ALARM: System is waking up from hibernation because + the battery is nearly empty +0x3003 Bay ejection (see 0x2x05) complete, can sleep again +0x3006 Bay hotplug request (hint to power up SATA link when + the optical drive tray is ejected) +0x4003 Undocked (see 0x2x04), can sleep again +0x4010 Docked into hotplug port replicator (non-ACPI dock) +0x4011 Undocked from hotplug port replicator (non-ACPI dock) +0x500B Tablet pen inserted into its storage bay +0x500C Tablet pen removed from its storage bay +0x6011 ALARM: battery is too hot +0x6012 ALARM: battery is extremely hot +0x6021 ALARM: a sensor is too hot +0x6022 ALARM: a sensor is extremely hot +0x6030 System thermal table changed +0x6032 Thermal Control command set completion (DYTC, Windows) +0x6040 Nvidia Optimus/AC adapter related (TO BE VERIFIED) +0x60C0 X1 Yoga 2016, Tablet mode status changed +0x60F0 Thermal Transformation changed (GMTS, Windows) +====== ===================================================== + +Battery nearly empty alarms are a last resort attempt to get the +operating system to hibernate or shutdown cleanly (0x2313), or shutdown +cleanly (0x2413) before power is lost. They must be acted upon, as the +wake up caused by the firmware will have negated most safety nets... + +When any of the "too hot" alarms happen, according to Lenovo the user +should suspend or hibernate the laptop (and in the case of battery +alarms, unplug the AC adapter) to let it cool down. These alarms do +signal that something is wrong, they should never happen on normal +operating conditions. + +The "extremely hot" alarms are emergencies. According to Lenovo, the +operating system is to force either an immediate suspend or hibernate +cycle, or a system shutdown. Obviously, something is very wrong if this +happens. + + +Brightness hotkey notes +^^^^^^^^^^^^^^^^^^^^^^^ + +Don't mess with the brightness hotkeys in a Thinkpad. If you want +notifications for OSD, use the sysfs backlight class event support. + +The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events +automatically for the cases were userspace has to do something to +implement brightness changes. When you override these events, you will +either fail to handle properly the ThinkPads that require explicit +action to change backlight brightness, or the ThinkPads that require +that no action be taken to work properly. + + +Bluetooth +--------- + +procfs: /proc/acpi/ibm/bluetooth + +sysfs device attribute: bluetooth_enable (deprecated) + +sysfs rfkill class: switch "tpacpi_bluetooth_sw" + +This feature shows the presence and current state of a ThinkPad +Bluetooth device in the internal ThinkPad CDC slot. + +If the ThinkPad supports it, the Bluetooth state is stored in NVRAM, +so it is kept across reboots and power-off. + +Procfs notes +^^^^^^^^^^^^ + +If Bluetooth is installed, the following commands can be used:: + + echo enable > /proc/acpi/ibm/bluetooth + echo disable > /proc/acpi/ibm/bluetooth + +Sysfs notes +^^^^^^^^^^^ + + If the Bluetooth CDC card is installed, it can be enabled / + disabled through the "bluetooth_enable" thinkpad-acpi device + attribute, and its current status can also be queried. + + enable: + + - 0: disables Bluetooth / Bluetooth is disabled + - 1: enables Bluetooth / Bluetooth is enabled. + + Note: this interface has been superseded by the generic rfkill + class. It has been deprecated, and it will be removed in year + 2010. + + rfkill controller switch "tpacpi_bluetooth_sw": refer to + Documentation/rfkill.txt for details. + + +Video output control -- /proc/acpi/ibm/video +-------------------------------------------- + +This feature allows control over the devices used for video output - +LCD, CRT or DVI (if available). The following commands are available:: + + echo lcd_enable > /proc/acpi/ibm/video + echo lcd_disable > /proc/acpi/ibm/video + echo crt_enable > /proc/acpi/ibm/video + echo crt_disable > /proc/acpi/ibm/video + echo dvi_enable > /proc/acpi/ibm/video + echo dvi_disable > /proc/acpi/ibm/video + echo auto_enable > /proc/acpi/ibm/video + echo auto_disable > /proc/acpi/ibm/video + echo expand_toggle > /proc/acpi/ibm/video + echo video_switch > /proc/acpi/ibm/video + +NOTE: + Access to this feature is restricted to processes owning the + CAP_SYS_ADMIN capability for safety reasons, as it can interact badly + enough with some versions of X.org to crash it. + +Each video output device can be enabled or disabled individually. +Reading /proc/acpi/ibm/video shows the status of each device. + +Automatic video switching can be enabled or disabled. When automatic +video switching is enabled, certain events (e.g. opening the lid, +docking or undocking) cause the video output device to change +automatically. While this can be useful, it also causes flickering +and, on the X40, video corruption. By disabling automatic switching, +the flickering or video corruption can be avoided. + +The video_switch command cycles through the available video outputs +(it simulates the behavior of Fn-F7). + +Video expansion can be toggled through this feature. This controls +whether the display is expanded to fill the entire LCD screen when a +mode with less than full resolution is used. Note that the current +video expansion status cannot be determined through this feature. + +Note that on many models (particularly those using Radeon graphics +chips) the X driver configures the video card in a way which prevents +Fn-F7 from working. This also disables the video output switching +features of this driver, as it uses the same ACPI methods as +Fn-F7. Video switching on the console should still work. + +UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000 + + +ThinkLight control +------------------ + +procfs: /proc/acpi/ibm/light + +sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED + +procfs notes +^^^^^^^^^^^^ + +The ThinkLight status can be read and set through the procfs interface. A +few models which do not make the status available will show the ThinkLight +status as "unknown". The available commands are:: + + echo on > /proc/acpi/ibm/light + echo off > /proc/acpi/ibm/light + +sysfs notes +^^^^^^^^^^^ + +The ThinkLight sysfs interface is documented by the LED class +documentation, in Documentation/leds/leds-class.rst. The ThinkLight LED name +is "tpacpi::thinklight". + +Due to limitations in the sysfs LED class, if the status of the ThinkLight +cannot be read or if it is unknown, thinkpad-acpi will report it as "off". +It is impossible to know if the status returned through sysfs is valid. + + +CMOS/UCMS control +----------------- + +procfs: /proc/acpi/ibm/cmos + +sysfs device attribute: cmos_command + +This feature is mostly used internally by the ACPI firmware to keep the legacy +CMOS NVRAM bits in sync with the current machine state, and to record this +state so that the ThinkPad will retain such settings across reboots. + +Some of these commands actually perform actions in some ThinkPad models, but +this is expected to disappear more and more in newer models. As an example, in +a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for +real, but commands 0 to 2 don't control the mixer anymore (they have been +phased out) and just update the NVRAM. + +The range of valid cmos command numbers is 0 to 21, but not all have an +effect and the behavior varies from model to model. Here is the behavior +on the X40 (tpb is the ThinkPad Buttons utility): + + - 0 - Related to "Volume down" key press + - 1 - Related to "Volume up" key press + - 2 - Related to "Mute on" key press + - 3 - Related to "Access IBM" key press + - 4 - Related to "LCD brightness up" key press + - 5 - Related to "LCD brightness down" key press + - 11 - Related to "toggle screen expansion" key press/function + - 12 - Related to "ThinkLight on" + - 13 - Related to "ThinkLight off" + - 14 - Related to "ThinkLight" key press (toggle ThinkLight) + +The cmos command interface is prone to firmware split-brain problems, as +in newer ThinkPads it is just a compatibility layer. Do not use it, it is +exported just as a debug tool. + + +LED control +----------- + +procfs: /proc/acpi/ibm/led +sysfs attributes: as per LED class, see below for names + +Some of the LED indicators can be controlled through this feature. On +some older ThinkPad models, it is possible to query the status of the +LED indicators as well. Newer ThinkPads cannot query the real status +of the LED indicators. + +Because misuse of the LEDs could induce an unaware user to perform +dangerous actions (like undocking or ejecting a bay device while the +buses are still active), or mask an important alarm (such as a nearly +empty battery, or a broken battery), access to most LEDs is +restricted. + +Unrestricted access to all LEDs requires that thinkpad-acpi be +compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled. +Distributions must never enable this option. Individual users that +are aware of the consequences are welcome to enabling it. + +Audio mute and microphone mute LEDs are supported, but currently not +visible to userspace. They are used by the snd-hda-intel audio driver. + +procfs notes +^^^^^^^^^^^^ + +The available commands are:: + + echo ' on' >/proc/acpi/ibm/led + echo ' off' >/proc/acpi/ibm/led + echo ' blink' >/proc/acpi/ibm/led + +The range is 0 to 15. The set of LEDs that can be +controlled varies from model to model. Here is the common ThinkPad +mapping: + + - 0 - power + - 1 - battery (orange) + - 2 - battery (green) + - 3 - UltraBase/dock + - 4 - UltraBay + - 5 - UltraBase battery slot + - 6 - (unknown) + - 7 - standby + - 8 - dock status 1 + - 9 - dock status 2 + - 10, 11 - (unknown) + - 12 - thinkvantage + - 13, 14, 15 - (unknown) + +All of the above can be turned on and off and can be made to blink. + +sysfs notes +^^^^^^^^^^^ + +The ThinkPad LED sysfs interface is described in detail by the LED class +documentation, in Documentation/leds/leds-class.rst. + +The LEDs are named (in LED ID order, from 0 to 12): +"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt", +"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt", +"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1", +"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3", +"tpacpi::thinkvantage". + +Due to limitations in the sysfs LED class, if the status of the LED +indicators cannot be read due to an error, thinkpad-acpi will report it as +a brightness of zero (same as LED off). + +If the thinkpad firmware doesn't support reading the current status, +trying to read the current LED brightness will just return whatever +brightness was last written to that attribute. + +These LEDs can blink using hardware acceleration. To request that a +ThinkPad indicator LED should blink in hardware accelerated mode, use the +"timer" trigger, and leave the delay_on and delay_off parameters set to +zero (to request hardware acceleration autodetection). + +LEDs that are known not to exist in a given ThinkPad model are not +made available through the sysfs interface. If you have a dock and you +notice there are LEDs listed for your ThinkPad that do not exist (and +are not in the dock), or if you notice that there are missing LEDs, +a report to ibm-acpi-devel@lists.sourceforge.net is appreciated. + + +ACPI sounds -- /proc/acpi/ibm/beep +---------------------------------- + +The BEEP method is used internally by the ACPI firmware to provide +audible alerts in various situations. This feature allows the same +sounds to be triggered manually. + +The commands are non-negative integer numbers:: + + echo >/proc/acpi/ibm/beep + +The valid range is 0 to 17. Not all numbers trigger sounds +and the sounds vary from model to model. Here is the behavior on the +X40: + + - 0 - stop a sound in progress (but use 17 to stop 16) + - 2 - two beeps, pause, third beep ("low battery") + - 3 - single beep + - 4 - high, followed by low-pitched beep ("unable") + - 5 - single beep + - 6 - very high, followed by high-pitched beep ("AC/DC") + - 7 - high-pitched beep + - 9 - three short beeps + - 10 - very long beep + - 12 - low-pitched beep + - 15 - three high-pitched beeps repeating constantly, stop with 0 + - 16 - one medium-pitched beep repeating constantly, stop with 17 + - 17 - stop 16 + + +Temperature sensors +------------------- + +procfs: /proc/acpi/ibm/thermal + +sysfs device attributes: (hwmon "thinkpad") temp*_input + +Most ThinkPads include six or more separate temperature sensors but only +expose the CPU temperature through the standard ACPI methods. This +feature shows readings from up to eight different sensors on older +ThinkPads, and up to sixteen different sensors on newer ThinkPads. + +For example, on the X40, a typical output may be: + +temperatures: + 42 42 45 41 36 -128 33 -128 + +On the T43/p, a typical output may be: + +temperatures: + 48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128 + +The mapping of thermal sensors to physical locations varies depending on +system-board model (and thus, on ThinkPad model). + +http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that +tries to track down these locations for various models. + +Most (newer?) models seem to follow this pattern: + +- 1: CPU +- 2: (depends on model) +- 3: (depends on model) +- 4: GPU +- 5: Main battery: main sensor +- 6: Bay battery: main sensor +- 7: Main battery: secondary sensor +- 8: Bay battery: secondary sensor +- 9-15: (depends on model) + +For the R51 (source: Thomas Gruber): + +- 2: Mini-PCI +- 3: Internal HDD + +For the T43, T43/p (source: Shmidoax/Thinkwiki.org) +http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p + +- 2: System board, left side (near PCMCIA slot), reported as HDAPS temp +- 3: PCMCIA slot +- 9: MCH (northbridge) to DRAM Bus +- 10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI + card, under touchpad +- 11: Power regulator, underside of system board, below F2 key + +The A31 has a very atypical layout for the thermal sensors +(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31) + +- 1: CPU +- 2: Main Battery: main sensor +- 3: Power Converter +- 4: Bay Battery: main sensor +- 5: MCH (northbridge) +- 6: PCMCIA/ambient +- 7: Main Battery: secondary sensor +- 8: Bay Battery: secondary sensor + + +Procfs notes +^^^^^^^^^^^^ + + Readings from sensors that are not available return -128. + No commands can be written to this file. + +Sysfs notes +^^^^^^^^^^^ + + Sensors that are not available return the ENXIO error. This + status may change at runtime, as there are hotplug thermal + sensors, like those inside the batteries and docks. + + thinkpad-acpi thermal sensors are reported through the hwmon + subsystem, and follow all of the hwmon guidelines at + Documentation/hwmon. + +EXPERIMENTAL: Embedded controller register dump +----------------------------------------------- + +This feature is not included in the thinkpad driver anymore. +Instead the EC can be accessed through /sys/kernel/debug/ec with +a userspace tool which can be found here: +ftp://ftp.suse.com/pub/people/trenn/sources/ec + +Use it to determine the register holding the fan +speed on some models. To do that, do the following: + + - make sure the battery is fully charged + - make sure the fan is running + - use above mentioned tool to read out the EC + +Often fan and temperature values vary between +readings. Since temperatures don't change vary fast, you can take +several quick dumps to eliminate them. + +You can use a similar method to figure out the meaning of other +embedded controller registers - e.g. make sure nothing else changes +except the charging or discharging battery to determine which +registers contain the current battery capacity, etc. If you experiment +with this, do send me your results (including some complete dumps with +a description of the conditions when they were taken.) + + +LCD brightness control +---------------------- + +procfs: /proc/acpi/ibm/brightness + +sysfs backlight device "thinkpad_screen" + +This feature allows software control of the LCD brightness on ThinkPad +models which don't have a hardware brightness slider. + +It has some limitations: the LCD backlight cannot be actually turned +on or off by this interface, it just controls the backlight brightness +level. + +On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control +has eight brightness levels, ranging from 0 to 7. Some of the levels +may not be distinct. Later Lenovo models that implement the ACPI +display backlight brightness control methods have 16 levels, ranging +from 0 to 15. + +For IBM ThinkPads, there are two interfaces to the firmware for direct +brightness control, EC and UCMS (or CMOS). To select which one should be +used, use the brightness_mode module parameter: brightness_mode=1 selects +EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC +mode with NVRAM backing (so that brightness changes are remembered across +shutdown/reboot). + +The driver tries to select which interface to use from a table of +defaults for each ThinkPad model. If it makes a wrong choice, please +report this as a bug, so that we can fix it. + +Lenovo ThinkPads only support brightness_mode=2 (UCMS). + +When display backlight brightness controls are available through the +standard ACPI interface, it is best to use it instead of this direct +ThinkPad-specific interface. The driver will disable its native +backlight brightness control interface if it detects that the standard +ACPI interface is available in the ThinkPad. + +If you want to use the thinkpad-acpi backlight brightness control +instead of the generic ACPI video backlight brightness control for some +reason, you should use the acpi_backlight=vendor kernel parameter. + +The brightness_enable module parameter can be used to control whether +the LCD brightness control feature will be enabled when available. +brightness_enable=0 forces it to be disabled. brightness_enable=1 +forces it to be enabled when available, even if the standard ACPI +interface is also available. + +Procfs notes +^^^^^^^^^^^^ + +The available commands are:: + + echo up >/proc/acpi/ibm/brightness + echo down >/proc/acpi/ibm/brightness + echo 'level ' >/proc/acpi/ibm/brightness + +Sysfs notes +^^^^^^^^^^^ + +The interface is implemented through the backlight sysfs class, which is +poorly documented at this time. + +Locate the thinkpad_screen device under /sys/class/backlight, and inside +it there will be the following attributes: + + max_brightness: + Reads the maximum brightness the hardware can be set to. + The minimum is always zero. + + actual_brightness: + Reads what brightness the screen is set to at this instant. + + brightness: + Writes request the driver to change brightness to the + given value. Reads will tell you what brightness the + driver is trying to set the display to when "power" is set + to zero and the display has not been dimmed by a kernel + power management event. + + power: + power management mode, where 0 is "display on", and 1 to 3 + will dim the display backlight to brightness level 0 + because thinkpad-acpi cannot really turn the backlight + off. Kernel power management events can temporarily + increase the current power management level, i.e. they can + dim the display. + + +WARNING: + + Whatever you do, do NOT ever call thinkpad-acpi backlight-level change + interface and the ACPI-based backlight level change interface + (available on newer BIOSes, and driven by the Linux ACPI video driver) + at the same time. The two will interact in bad ways, do funny things, + and maybe reduce the life of the backlight lamps by needlessly kicking + its level up and down at every change. + + +Volume control (Console Audio control) +-------------------------------------- + +procfs: /proc/acpi/ibm/volume + +ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC" + +NOTE: by default, the volume control interface operates in read-only +mode, as it is supposed to be used for on-screen-display purposes. +The read/write mode can be enabled through the use of the +"volume_control=1" module parameter. + +NOTE: distros are urged to not enable volume_control by default, this +should be done by the local admin only. The ThinkPad UI is for the +console audio control to be done through the volume keys only, and for +the desktop environment to just provide on-screen-display feedback. +Software volume control should be done only in the main AC97/HDA +mixer. + + +About the ThinkPad Console Audio control +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +ThinkPads have a built-in amplifier and muting circuit that drives the +console headphone and speakers. This circuit is after the main AC97 +or HDA mixer in the audio path, and under exclusive control of the +firmware. + +ThinkPads have three special hotkeys to interact with the console +audio control: volume up, volume down and mute. + +It is worth noting that the normal way the mute function works (on +ThinkPads that do not have a "mute LED") is: + +1. Press mute to mute. It will *always* mute, you can press it as + many times as you want, and the sound will remain mute. + +2. Press either volume key to unmute the ThinkPad (it will _not_ + change the volume, it will just unmute). + +This is a very superior design when compared to the cheap software-only +mute-toggle solution found on normal consumer laptops: you can be +absolutely sure the ThinkPad will not make noise if you press the mute +button, no matter the previous state. + +The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain +amplifiers driving the speakers and headphone output, and the firmware +also handles volume control for the headphone and speakers on these +ThinkPads without any help from the operating system (this volume +control stage exists after the main AC97 or HDA mixer in the audio +path). + +The newer Lenovo models only have firmware mute control, and depend on +the main HDA mixer to do volume control (which is done by the operating +system). In this case, the volume keys are filtered out for unmute +key press (there are some firmware bugs in this area) and delivered as +normal key presses to the operating system (thinkpad-acpi is not +involved). + + +The ThinkPad-ACPI volume control +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The preferred way to interact with the Console Audio control is the +ALSA interface. + +The legacy procfs interface allows one to read the current state, +and if volume control is enabled, accepts the following commands:: + + echo up >/proc/acpi/ibm/volume + echo down >/proc/acpi/ibm/volume + echo mute >/proc/acpi/ibm/volume + echo unmute >/proc/acpi/ibm/volume + echo 'level ' >/proc/acpi/ibm/volume + +The number range is 0 to 14 although not all of them may be +distinct. To unmute the volume after the mute command, use either the +up or down command (the level command will not unmute the volume), or +the unmute command. + +You can use the volume_capabilities parameter to tell the driver +whether your thinkpad has volume control or mute-only control: +volume_capabilities=1 for mixers with mute and volume control, +volume_capabilities=2 for mixers with only mute control. + +If the driver misdetects the capabilities for your ThinkPad model, +please report this to ibm-acpi-devel@lists.sourceforge.net, so that we +can update the driver. + +There are two strategies for volume control. To select which one +should be used, use the volume_mode module parameter: volume_mode=1 +selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing +(so that volume/mute changes are remembered across shutdown/reboot). + +The driver will operate in volume_mode=3 by default. If that does not +work well on your ThinkPad model, please report this to +ibm-acpi-devel@lists.sourceforge.net. + +The driver supports the standard ALSA module parameters. If the ALSA +mixer is disabled, the driver will disable all volume functionality. + + +Fan control and monitoring: fan speed, fan enable/disable +--------------------------------------------------------- + +procfs: /proc/acpi/ibm/fan + +sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, pwm1_enable, fan2_input + +sysfs hwmon driver attributes: fan_watchdog + +NOTE NOTE NOTE: + fan control operations are disabled by default for + safety reasons. To enable them, the module parameter "fan_control=1" + must be given to thinkpad-acpi. + +This feature attempts to show the current fan speed, control mode and +other fan data that might be available. The speed is read directly +from the hardware registers of the embedded controller. This is known +to work on later R, T, X and Z series ThinkPads but may show a bogus +value on other models. + +Some Lenovo ThinkPads support a secondary fan. This fan cannot be +controlled separately, it shares the main fan control. + +Fan levels +^^^^^^^^^^ + +Most ThinkPad fans work in "levels" at the firmware interface. Level 0 +stops the fan. The higher the level, the higher the fan speed, although +adjacent levels often map to the same fan speed. 7 is the highest +level, where the fan reaches the maximum recommended speed. + +Level "auto" means the EC changes the fan level according to some +internal algorithm, usually based on readings from the thermal sensors. + +There is also a "full-speed" level, also known as "disengaged" level. +In this level, the EC disables the speed-locked closed-loop fan control, +and drives the fan as fast as it can go, which might exceed hardware +limits, so use this level with caution. + +The fan usually ramps up or down slowly from one speed to another, and +it is normal for the EC to take several seconds to react to fan +commands. The full-speed level may take up to two minutes to ramp up to +maximum speed, and in some ThinkPads, the tachometer readings go stale +while the EC is transitioning to the full-speed level. + +WARNING WARNING WARNING: do not leave the fan disabled unless you are +monitoring all of the temperature sensor readings and you are ready to +enable it if necessary to avoid overheating. + +An enabled fan in level "auto" may stop spinning if the EC decides the +ThinkPad is cool enough and doesn't need the extra airflow. This is +normal, and the EC will spin the fan up if the various thermal readings +rise too much. + +On the X40, this seems to depend on the CPU and HDD temperatures. +Specifically, the fan is turned on when either the CPU temperature +climbs to 56 degrees or the HDD temperature climbs to 46 degrees. The +fan is turned off when the CPU temperature drops to 49 degrees and the +HDD temperature drops to 41 degrees. These thresholds cannot +currently be controlled. + +The ThinkPad's ACPI DSDT code will reprogram the fan on its own when +certain conditions are met. It will override any fan programming done +through thinkpad-acpi. + +The thinkpad-acpi kernel driver can be programmed to revert the fan +level to a safe setting if userspace does not issue one of the procfs +fan commands: "enable", "disable", "level" or "watchdog", or if there +are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is +set to 1, manual mode) within a configurable amount of time of up to +120 seconds. This functionality is called fan safety watchdog. + +Note that the watchdog timer stops after it enables the fan. It will be +rearmed again automatically (using the same interval) when one of the +above mentioned fan commands is received. The fan watchdog is, +therefore, not suitable to protect against fan mode changes made through +means other than the "enable", "disable", and "level" procfs fan +commands, or the hwmon fan control sysfs interface. + +Procfs notes +^^^^^^^^^^^^ + +The fan may be enabled or disabled with the following commands:: + + echo enable >/proc/acpi/ibm/fan + echo disable >/proc/acpi/ibm/fan + +Placing a fan on level 0 is the same as disabling it. Enabling a fan +will try to place it in a safe level if it is too slow or disabled. + +The fan level can be controlled with the command:: + + echo 'level ' > /proc/acpi/ibm/fan + +Where is an integer from 0 to 7, or one of the words "auto" or +"full-speed" (without the quotes). Not all ThinkPads support the "auto" +and "full-speed" levels. The driver accepts "disengaged" as an alias for +"full-speed", and reports it as "disengaged" for backwards +compatibility. + +On the X31 and X40 (and ONLY on those models), the fan speed can be +controlled to a certain degree. Once the fan is running, it can be +forced to run faster or slower with the following command:: + + echo 'speed ' > /proc/acpi/ibm/fan + +The sustainable range of fan speeds on the X40 appears to be from about +3700 to about 7350. Values outside this range either do not have any +effect or the fan speed eventually settles somewhere in that range. The +fan cannot be stopped or started with this command. This functionality +is incomplete, and not available through the sysfs interface. + +To program the safety watchdog, use the "watchdog" command:: + + echo 'watchdog ' > /proc/acpi/ibm/fan + +If you want to disable the watchdog, use 0 as the interval. + +Sysfs notes +^^^^^^^^^^^ + +The sysfs interface follows the hwmon subsystem guidelines for the most +part, and the exception is the fan safety watchdog. + +Writes to any of the sysfs attributes may return the EINVAL error if +that operation is not supported in a given ThinkPad or if the parameter +is out-of-bounds, and EPERM if it is forbidden. They may also return +EINTR (interrupted system call), and EIO (I/O error while trying to talk +to the firmware). + +Features not yet implemented by the driver return ENOSYS. + +hwmon device attribute pwm1_enable: + - 0: PWM offline (fan is set to full-speed mode) + - 1: Manual PWM control (use pwm1 to set fan level) + - 2: Hardware PWM control (EC "auto" mode) + - 3: reserved (Software PWM control, not implemented yet) + + Modes 0 and 2 are not supported by all ThinkPads, and the + driver is not always able to detect this. If it does know a + mode is unsupported, it will return -EINVAL. + +hwmon device attribute pwm1: + Fan level, scaled from the firmware values of 0-7 to the hwmon + scale of 0-255. 0 means fan stopped, 255 means highest normal + speed (level 7). + + This attribute only commands the fan if pmw1_enable is set to 1 + (manual PWM control). + +hwmon device attribute fan1_input: + Fan tachometer reading, in RPM. May go stale on certain + ThinkPads while the EC transitions the PWM to offline mode, + which can take up to two minutes. May return rubbish on older + ThinkPads. + +hwmon device attribute fan2_input: + Fan tachometer reading, in RPM, for the secondary fan. + Available only on some ThinkPads. If the secondary fan is + not installed, will always read 0. + +hwmon driver attribute fan_watchdog: + Fan safety watchdog timer interval, in seconds. Minimum is + 1 second, maximum is 120 seconds. 0 disables the watchdog. + +To stop the fan: set pwm1 to zero, and pwm1_enable to 1. + +To start the fan in a safe mode: set pwm1_enable to 2. If that fails +with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255 +would be the safest choice, though). + + +WAN +--- + +procfs: /proc/acpi/ibm/wan + +sysfs device attribute: wwan_enable (deprecated) + +sysfs rfkill class: switch "tpacpi_wwan_sw" + +This feature shows the presence and current state of the built-in +Wireless WAN device. + +If the ThinkPad supports it, the WWAN state is stored in NVRAM, +so it is kept across reboots and power-off. + +It was tested on a Lenovo ThinkPad X60. It should probably work on other +ThinkPad models which come with this module installed. + +Procfs notes +^^^^^^^^^^^^ + +If the W-WAN card is installed, the following commands can be used:: + + echo enable > /proc/acpi/ibm/wan + echo disable > /proc/acpi/ibm/wan + +Sysfs notes +^^^^^^^^^^^ + + If the W-WAN card is installed, it can be enabled / + disabled through the "wwan_enable" thinkpad-acpi device + attribute, and its current status can also be queried. + + enable: + - 0: disables WWAN card / WWAN card is disabled + - 1: enables WWAN card / WWAN card is enabled. + + Note: this interface has been superseded by the generic rfkill + class. It has been deprecated, and it will be removed in year + 2010. + + rfkill controller switch "tpacpi_wwan_sw": refer to + Documentation/rfkill.txt for details. + + +EXPERIMENTAL: UWB +----------------- + +This feature is considered EXPERIMENTAL because it has not been extensively +tested and validated in various ThinkPad models yet. The feature may not +work as expected. USE WITH CAUTION! To use this feature, you need to supply +the experimental=1 parameter when loading the module. + +sysfs rfkill class: switch "tpacpi_uwb_sw" + +This feature exports an rfkill controller for the UWB device, if one is +present and enabled in the BIOS. + +Sysfs notes +^^^^^^^^^^^ + + rfkill controller switch "tpacpi_uwb_sw": refer to + Documentation/rfkill.txt for details. + +Adaptive keyboard +----------------- + +sysfs device attribute: adaptive_kbd_mode + +This sysfs attribute controls the keyboard "face" that will be shown on the +Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read +and set. + +- 1 = Home mode +- 2 = Web-browser mode +- 3 = Web-conference mode +- 4 = Function mode +- 5 = Layflat mode + +For more details about which buttons will appear depending on the mode, please +review the laptop's user guide: +http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf + +Multiple Commands, Module Parameters +------------------------------------ + +Multiple commands can be written to the proc files in one shot by +separating them with commas, for example:: + + echo enable,0xffff > /proc/acpi/ibm/hotkey + echo lcd_disable,crt_enable > /proc/acpi/ibm/video + +Commands can also be specified when loading the thinkpad-acpi module, +for example:: + + modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable + + +Enabling debugging output +------------------------- + +The module takes a debug parameter which can be used to selectively +enable various classes of debugging output, for example:: + + modprobe thinkpad_acpi debug=0xffff + +will enable all debugging output classes. It takes a bitmask, so +to enable more than one output class, just add their values. + + ============= ====================================== + Debug bitmask Description + ============= ====================================== + 0x8000 Disclose PID of userspace programs + accessing some functions of the driver + 0x0001 Initialization and probing + 0x0002 Removal + 0x0004 RF Transmitter control (RFKILL) + (bluetooth, WWAN, UWB...) + 0x0008 HKEY event interface, hotkeys + 0x0010 Fan control + 0x0020 Backlight brightness + 0x0040 Audio mixer/volume control + ============= ====================================== + +There is also a kernel build option to enable more debugging +information, which may be necessary to debug driver problems. + +The level of debugging information output by the driver can be changed +at runtime through sysfs, using the driver attribute debug_level. The +attribute takes the same bitmask as the debug module parameter above. + + +Force loading of module +----------------------- + +If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify +the module parameter force_load=1. Regardless of whether this works or +not, please contact ibm-acpi-devel@lists.sourceforge.net with a report. + + +Sysfs interface changelog +^^^^^^^^^^^^^^^^^^^^^^^^^ + +========= =============================================================== +0x000100: Initial sysfs support, as a single platform driver and + device. +0x000200: Hot key support for 32 hot keys, and radio slider switch + support. +0x010000: Hot keys are now handled by default over the input + layer, the radio switch generates input event EV_RADIO, + and the driver enables hot key handling by default in + the firmware. + +0x020000: ABI fix: added a separate hwmon platform device and + driver, which must be located by name (thinkpad) + and the hwmon class for libsensors4 (lm-sensors 3) + compatibility. Moved all hwmon attributes to this + new platform device. + +0x020100: Marker for thinkpad-acpi with hot key NVRAM polling + support. If you must, use it to know you should not + start a userspace NVRAM poller (allows to detect when + NVRAM is compiled out by the user because it is + unneeded/undesired in the first place). +0x020101: Marker for thinkpad-acpi with hot key NVRAM polling + and proper hotkey_mask semantics (version 8 of the + NVRAM polling patch). Some development snapshots of + 0.18 had an earlier version that did strange things + to hotkey_mask. + +0x020200: Add poll()/select() support to the following attributes: + hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason + +0x020300: hotkey enable/disable support removed, attributes + hotkey_bios_enabled and hotkey_enable deprecated and + marked for removal. + +0x020400: Marker for 16 LEDs support. Also, LEDs that are known + to not exist in a given model are not registered with + the LED sysfs class anymore. + +0x020500: Updated hotkey driver, hotkey_mask is always available + and it is always able to disable hot keys. Very old + thinkpads are properly supported. hotkey_bios_mask + is deprecated and marked for removal. + +0x020600: Marker for backlight change event support. + +0x020700: Support for mute-only mixers. + Volume control in read-only mode by default. + Marker for ALSA mixer support. + +0x030000: Thermal and fan sysfs attributes were moved to the hwmon + device instead of being attached to the backing platform + device. +========= =============================================================== diff --git a/Documentation/admin-guide/laptops/toshiba_haps.rst b/Documentation/admin-guide/laptops/toshiba_haps.rst new file mode 100644 index 000000000000..11dfc428c080 --- /dev/null +++ b/Documentation/admin-guide/laptops/toshiba_haps.rst @@ -0,0 +1,87 @@ +==================================== +Toshiba HDD Active Protection Sensor +==================================== + +Kernel driver: toshiba_haps + +Author: Azael Avalos + + +.. 0. Contents + + 1. Description + 2. Interface + 3. Accelerometer axes + 4. Supported devices + 5. Usage + + +1. Description +-------------- + +This driver provides support for the accelerometer found in various Toshiba +laptops, being called "Toshiba HDD Protection - Shock Sensor" officially, +and detects laptops automatically with this device. +On Windows, Toshiba provided software monitors this device and provides +automatic HDD protection (head unload) on sudden moves or harsh vibrations, +however, this driver only provides a notification via a sysfs file to let +userspace tools or daemons act accordingly, as well as providing a sysfs +file to set the desired protection level or sensor sensibility. + + +2. Interface +------------ + +This device comes with 3 methods: + +==== ===================================================================== +_STA Checks existence of the device, returning Zero if the device does not + exists or is not supported. +PTLV Sets the desired protection level. +RSSS Shuts down the HDD protection interface for a few seconds, + then restores normal operation. +==== ===================================================================== + +Note: + The presence of Solid State Drives (SSD) can make this driver to fail loading, + given the fact that such drives have no movable parts, and thus, not requiring + any "protection" as well as failing during the evaluation of the _STA method + found under this device. + + +3. Accelerometer axes +--------------------- + +This device does not report any axes, however, to query the sensor position +a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are +provided to query such information, handled by the kernel module toshiba_acpi +since kernel version 3.15. + + +4. Supported devices +-------------------- + +This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop +with this device is supported, given the fact that they have the presence of +conventional HDD and not only SSD, or a combination of both HDD and SSD. + + +5. Usage +-------- + +The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are: + +================ ============================================================ +protection_level The protection_level is readable and writeable, and + provides a way to let userspace query the current protection + level, as well as set the desired protection level, the + available protection levels are: + + ============ ======= ========== ======== + 0 - Disabled 1 - Low 2 - Medium 3 - High + ============ ======= ========== ======== + +reset_protection The reset_protection entry is writeable only, being "1" + the only parameter it accepts, it is used to trigger + a reset of the protection interface. +================ ============================================================ diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5aceb5cd5ce7..64aeee1009ca 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -108,7 +108,7 @@ block_dump ========== block_dump enables block I/O debugging when set to a nonzero value. More -information on block I/O debugging is in Documentation/laptops/laptop-mode.rst. +information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst. compact_memory @@ -298,7 +298,7 @@ laptop_mode =========== laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/laptops/laptop-mode.rst. +controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst. legacy_va_layout diff --git a/Documentation/laptops/asus-laptop.rst b/Documentation/laptops/asus-laptop.rst deleted file mode 100644 index 95176321a25a..000000000000 --- a/Documentation/laptops/asus-laptop.rst +++ /dev/null @@ -1,271 +0,0 @@ -================== -Asus Laptop Extras -================== - -Version 0.1 - -August 6, 2009 - -Corentin Chary -http://acpi4asus.sf.net/ - - This driver provides support for extra features of ACPI-compatible ASUS laptops. - It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or - VICTOR XP7210 for example). It makes all the extra buttons generate input - events (like keyboards). - - On some models adds support for changing the display brightness and output, - switching the LCD backlight on and off, and most importantly, allows you to - blink those fancy LEDs intended for reporting mail and wireless status. - -This driver supersedes the old asus_acpi driver. - -Requirements ------------- - - Kernel 2.6.X sources, configured for your computer, with ACPI support. - You also need CONFIG_INPUT and CONFIG_ACPI. - -Status ------- - - The features currently supported are the following (see below for - detailed description): - - - Fn key combinations - - Bluetooth enable and disable - - Wlan enable and disable - - GPS enable and disable - - Video output switching - - Ambient Light Sensor on and off - - LED control - - LED Display control - - LCD brightness control - - LCD on and off - - A compatibility table by model and feature is maintained on the web - site, http://acpi4asus.sf.net/. - -Usage ------ - - Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should - see some lines like this : - - Asus Laptop Extras version 0.42 - - L2D model detected. - - If it is not the output you have on your laptop, send it (and the laptop's - DSDT) to me. - - That's all, now, all the events generated by the hotkeys of your laptop - should be reported via netlink events. You can check with - "acpi_genl monitor" (part of the acpica project). - - Hotkeys are also reported as input keys (like keyboards) you can check - which key are supported using "xev" under X11. - - You can get information on the version of your DSDT table by reading the - /sys/devices/platform/asus-laptop/infos entry. If you have a question or a - bug report to do, please include the output of this entry. - -LEDs ----- - - You can modify LEDs be echoing values to `/sys/class/leds/asus/*/brightness`:: - - echo 1 > /sys/class/leds/asus::mail/brightness - - will switch the mail LED on. - - You can also know if they are on/off by reading their content and use - kernel triggers like disk-activity or heartbeat. - -Backlight ---------- - - You can control lcd backlight power and brightness with - /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15. - -Wireless devices ----------------- - - You can turn the internal Bluetooth adapter on/off with the bluetooth entry - (only on models with Bluetooth). This usually controls the associated LED. - Same for Wlan adapter. - -Display switching ------------------ - - Note: the display switching code is currently considered EXPERIMENTAL. - - Switching works for the following models: - - - L3800C - - A2500H - - L5800C - - M5200N - - W1000N (albeit with some glitches) - - M6700R - - A6JC - - F3J - - Switching doesn't work for the following: - - - M3700N - - L2X00D (locks the laptop under certain conditions) - - To switch the displays, echo values from 0 to 15 to - /sys/devices/platform/asus-laptop/display. The significance of those values - is as follows: - - +-------+-----+-----+-----+-----+-----+ - | Bin | Val | DVI | TV | CRT | LCD | - +-------+-----+-----+-----+-----+-----+ - | 0000 | 0 | | | | | - +-------+-----+-----+-----+-----+-----+ - | 0001 | 1 | | | | X | - +-------+-----+-----+-----+-----+-----+ - | 0010 | 2 | | | X | | - +-------+-----+-----+-----+-----+-----+ - | 0011 | 3 | | | X | X | - +-------+-----+-----+-----+-----+-----+ - | 0100 | 4 | | X | | | - +-------+-----+-----+-----+-----+-----+ - | 0101 | 5 | | X | | X | - +-------+-----+-----+-----+-----+-----+ - | 0110 | 6 | | X | X | | - +-------+-----+-----+-----+-----+-----+ - | 0111 | 7 | | X | X | X | - +-------+-----+-----+-----+-----+-----+ - | 1000 | 8 | X | | | | - +-------+-----+-----+-----+-----+-----+ - | 1001 | 9 | X | | | X | - +-------+-----+-----+-----+-----+-----+ - | 1010 | 10 | X | | X | | - +-------+-----+-----+-----+-----+-----+ - | 1011 | 11 | X | | X | X | - +-------+-----+-----+-----+-----+-----+ - | 1100 | 12 | X | X | | | - +-------+-----+-----+-----+-----+-----+ - | 1101 | 13 | X | X | | X | - +-------+-----+-----+-----+-----+-----+ - | 1110 | 14 | X | X | X | | - +-------+-----+-----+-----+-----+-----+ - | 1111 | 15 | X | X | X | X | - +-------+-----+-----+-----+-----+-----+ - - In most cases, the appropriate displays must be plugged in for the above - combinations to work. TV-Out may need to be initialized at boot time. - - Debugging: - - 1) Check whether the Fn+F8 key: - - a) does not lock the laptop (try a boot with noapic / nolapic if it does) - b) generates events (0x6n, where n is the value corresponding to the - configuration above) - c) actually works - - Record the disp value at every configuration. - 2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display. - Record its value, note any change. If nothing changes, try a broader range, - up to 65535. - 3) Send ANY output (both positive and negative reports are needed, unless your - machine is already listed above) to the acpi4asus-user mailing list. - - Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n - events are generated and no actual switching occurs. In such a case, a line - like:: - - echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display - - will usually do the trick ($arg is the 0000006n-like event passed to acpid). - - Note: there is currently no reliable way to read display status on xxN - (Centrino) models. - -LED display ------------ - - Some models like the W1N have a LED display that can be used to display - several items of information. - - LED display works for the following models: - - - W1000N - - W1J - - To control the LED display, use the following:: - - echo 0x0T000DDD > /sys/devices/platform/asus-laptop/ - - where T control the 3 letters display, and DDD the 3 digits display, - according to the tables below:: - - DDD (digits) - 000 to 999 = display digits - AAA = --- - BBB to FFF = turn-off - - T (type) - 0 = off - 1 = dvd - 2 = vcd - 3 = mp3 - 4 = cd - 5 = tv - 6 = cpu - 7 = vol - - For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd" - would display "DVD001". - -Driver options --------------- - - Options can be passed to the asus-laptop driver using the standard - module argument syntax (= when passing the option to the - module or asus-laptop.= on the kernel boot line when - asus-laptop is statically linked into the kernel). - - wapf: WAPF defines the behavior of the Fn+Fx wlan key - The significance of values is yet to be found, but - most of the time: - - - 0x0 should do nothing - - 0x1 should allow to control the device with Fn+Fx key. - - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key - - 0x5 like 0x1 or 0x4 - - The default value is 0x1. - -Unsupported models ------------------- - - These models will never be supported by this module, as they use a completely - different mechanism to handle LEDs and extra stuff (meaning we have no clue - how it works): - - - ASUS A1300 (A1B), A1370D - - ASUS L7300G - - ASUS L8400 - -Patches, Errors, Questions --------------------------- - - I appreciate any success or failure - reports, especially if they add to or correct the compatibility table. - Please include the following information in your report: - - - Asus model name - - a copy of your ACPI tables, using the "acpidump" utility - - a copy of /sys/devices/platform/asus-laptop/infos - - which driver features work and which don't - - the observed behavior of non-working features - - Any other comments or patches are also more than welcome. - - acpi4asus-user@lists.sourceforge.net - - http://sourceforge.net/projects/acpi4asus diff --git a/Documentation/laptops/disk-shock-protection.rst b/Documentation/laptops/disk-shock-protection.rst deleted file mode 100644 index e97c5f78d8c3..000000000000 --- a/Documentation/laptops/disk-shock-protection.rst +++ /dev/null @@ -1,151 +0,0 @@ -========================== -Hard disk shock protection -========================== - -Author: Elias Oltmanns - -Last modified: 2008-10-03 - - -.. 0. Contents - - 1. Intro - 2. The interface - 3. References - 4. CREDITS - - -1. Intro --------- - -ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature. -Issuing this command should cause the drive to switch to idle mode and -unload disk heads. This feature is being used in modern laptops in -conjunction with accelerometers and appropriate software to implement -a shock protection facility. The idea is to stop all I/O operations on -the internal hard drive and park its heads on the ramp when critical -situations are anticipated. The desire to have such a feature -available on GNU/Linux systems has been the original motivation to -implement a generic disk head parking interface in the Linux kernel. -Please note, however, that other components have to be set up on your -system in order to get disk shock protection working (see -section 3. References below for pointers to more information about -that). - - -2. The interface ----------------- - -For each ATA device, the kernel exports the file -`block/*/device/unload_heads` in sysfs (here assumed to be mounted under -/sys). Access to `/sys/block/*/device/unload_heads` is denied with --EOPNOTSUPP if the device does not support the unload feature. -Otherwise, writing an integer value to this file will take the heads -of the respective drive off the platter and block all I/O operations -for the specified number of milliseconds. When the timeout expires and -no further disk head park request has been issued in the meantime, -normal operation will be resumed. The maximal value accepted for a -timeout is 30000 milliseconds. Exceeding this limit will return --EOVERFLOW, but heads will be parked anyway and the timeout will be -set to 30 seconds. However, you can always change a timeout to any -value between 0 and 30000 by issuing a subsequent head park request -before the timeout of the previous one has expired. In particular, the -total timeout can exceed 30 seconds and, more importantly, you can -cancel a previously set timeout and resume normal operation -immediately by specifying a timeout of 0. Values below -2 are rejected -with -EINVAL (see below for the special meaning of -1 and -2). If the -timeout specified for a recent head park request has not yet expired, -reading from `/sys/block/*/device/unload_heads` will report the number -of milliseconds remaining until normal operation will be resumed; -otherwise, reading the unload_heads attribute will return 0. - -For example, do the following in order to park the heads of drive -/dev/sda and stop all I/O operations for five seconds:: - - # echo 5000 > /sys/block/sda/device/unload_heads - -A simple:: - - # cat /sys/block/sda/device/unload_heads - -will show you how many milliseconds are left before normal operation -will be resumed. - -A word of caution: The fact that the interface operates on a basis of -milliseconds may raise expectations that cannot be satisfied in -reality. In fact, the ATA specs clearly state that the time for an -unload operation to complete is vendor specific. The hint in ATA-7 -that this will typically be within 500 milliseconds apparently has -been dropped in ATA-8. - -There is a technical detail of this implementation that may cause some -confusion and should be discussed here. When a head park request has -been issued to a device successfully, all I/O operations on the -controller port this device is attached to will be deferred. That is -to say, any other device that may be connected to the same port will -be affected too. The only exception is that a subsequent head unload -request to that other device will be executed immediately. Further -operations on that port will be deferred until the timeout specified -for either device on the port has expired. As far as PATA (old style -IDE) configurations are concerned, there can only be two devices -attached to any single port. In SATA world we have port multipliers -which means that a user-issued head parking request to one device may -actually result in stopping I/O to a whole bunch of devices. However, -since this feature is supposed to be used on laptops and does not seem -to be very useful in any other environment, there will be mostly one -device per port. Even if the CD/DVD writer happens to be connected to -the same port as the hard drive, it generally *should* recover just -fine from the occasional buffer under-run incurred by a head park -request to the HD. Actually, when you are using an ide driver rather -than its libata counterpart (i.e. your disk is called /dev/hda -instead of /dev/sda), then parking the heads of one drive (drive X) -will generally not affect the mode of operation of another drive -(drive Y) on the same port as described above. It is only when a port -reset is required to recover from an exception on drive Y that further -I/O operations on that drive (and the reset itself) will be delayed -until drive X is no longer in the parked state. - -Finally, there are some hard drives that only comply with an earlier -version of the ATA standard than ATA-7, but do support the unload -feature nonetheless. Unfortunately, there is no safe way Linux can -detect these devices, so you won't be able to write to the -unload_heads attribute. If you know that your device really does -support the unload feature (for instance, because the vendor of your -laptop or the hard drive itself told you so), then you can tell the -kernel to enable the usage of this feature for that drive by writing -the special value -1 to the unload_heads attribute:: - - # echo -1 > /sys/block/sda/device/unload_heads - -will enable the feature for /dev/sda, and giving -2 instead of -1 will -disable it again. - - -3. References -------------- - -There are several laptops from different vendors featuring shock -protection capabilities. As manufacturers have refused to support open -source development of the required software components so far, Linux -support for shock protection varies considerably between different -hardware implementations. Ideally, this section should contain a list -of pointers at different projects aiming at an implementation of shock -protection on different systems. Unfortunately, I only know of a -single project which, although still considered experimental, is fit -for use. Please feel free to add projects that have been the victims -of my ignorance. - -- http://www.thinkwiki.org/wiki/HDAPS - - See this page for information about Linux support of the hard disk - active protection system as implemented in IBM/Lenovo Thinkpads. - - -4. CREDITS ----------- - -This implementation of disk head parking has been inspired by a patch -originally published by Jon Escombe . My efforts -to develop an implementation of this feature that is fit to be merged -into mainline have been aided by various kernel developers, in -particular by Tejun Heo and Bartlomiej Zolnierkiewicz. diff --git a/Documentation/laptops/index.rst b/Documentation/laptops/index.rst deleted file mode 100644 index 001a30910d09..000000000000 --- a/Documentation/laptops/index.rst +++ /dev/null @@ -1,17 +0,0 @@ -:orphan: - -============== -Laptop Drivers -============== - -.. toctree:: - :maxdepth: 1 - - asus-laptop - disk-shock-protection - laptop-mode - lg-laptop - sony-laptop - sonypi - thinkpad-acpi - toshiba_haps diff --git a/Documentation/laptops/laptop-mode.rst b/Documentation/laptops/laptop-mode.rst deleted file mode 100644 index c984c4262f2e..000000000000 --- a/Documentation/laptops/laptop-mode.rst +++ /dev/null @@ -1,781 +0,0 @@ -=============================================== -How to conserve battery power using laptop-mode -=============================================== - -Document Author: Bart Samwel (bart@samwel.tk) - -Date created: January 2, 2004 - -Last modified: December 06, 2004 - -Introduction ------------- - -Laptop mode is used to minimize the time that the hard disk needs to be spun up, -to conserve battery power on laptops. It has been reported to cause significant -power savings. - -.. Contents - - * Introduction - * Installation - * Caveats - * The Details - * Tips & Tricks - * Control script - * ACPI integration - * Monitoring tool - - -Installation ------------- - -To use laptop mode, you don't need to set any kernel configuration options -or anything. Simply install all the files included in this document, and -laptop mode will automatically be started when you're on battery. For -your convenience, a tarball containing an installer can be downloaded at: - - http://www.samwel.tk/laptop_mode/laptop_mode/ - -To configure laptop mode, you need to edit the configuration file, which is -located in /etc/default/laptop-mode on Debian-based systems, or in -/etc/sysconfig/laptop-mode on other systems. - -Unfortunately, automatic enabling of laptop mode does not work for -laptops that don't have ACPI. On those laptops, you need to start laptop -mode manually. To start laptop mode, run "laptop_mode start", and to -stop it, run "laptop_mode stop". (Note: The laptop mode tools package now -has experimental support for APM, you might want to try that first.) - - -Caveats -------- - -* The downside of laptop mode is that you have a chance of losing up to 10 - minutes of work. If you cannot afford this, don't use it! The supplied ACPI - scripts automatically turn off laptop mode when the battery almost runs out, - so that you won't lose any data at the end of your battery life. - -* Most desktop hard drives have a very limited lifetime measured in spindown - cycles, typically about 50.000 times (it's usually listed on the spec sheet). - Check your drive's rating, and don't wear down your drive's lifetime if you - don't need to. - -* If you mount some of your ext3/reiserfs filesystems with the -n option, then - the control script will not be able to remount them correctly. You must set - DO_REMOUNTS=0 in the control script, otherwise it will remount them with the - wrong options -- or it will fail because it cannot write to /etc/mtab. - -* If you have your filesystems listed as type "auto" in fstab, like I did, then - the control script will not recognize them as filesystems that need remounting. - You must list the filesystems with their true type instead. - -* It has been reported that some versions of the mutt mail client use file access - times to determine whether a folder contains new mail. If you use mutt and - experience this, you must disable the noatime remounting by setting the option - DO_REMOUNT_NOATIME to 0 in the configuration file. - - -The Details ------------ - -Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is -present for all kernels that have the laptop mode patch, regardless of any -configuration options. When the knob is set, any physical disk I/O (that might -have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The -result of this is that after a disk has spun down, it will not be spun up -anymore to write dirty blocks, because those blocks had already been written -immediately after the most recent read operation. The value of the laptop_mode -knob determines the time between the occurrence of disk I/O and when the flush -is triggered. A sensible value for the knob is 5 seconds. Setting the knob to -0 disables laptop mode. - -To increase the effectiveness of the laptop_mode strategy, the laptop_mode -control script increases dirty_expire_centisecs and dirty_writeback_centisecs in -/proc/sys/vm to about 10 minutes (by default), which means that pages that are -dirtied are not forced to be written to disk as often. The control script also -changes the dirty background ratio, so that background writeback of dirty pages -is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 or ReiserFS filesystems (also done automatically by the control script), -this results in concentration of disk activity in a small time interval which -occurs only once every 10 minutes, or whenever the disk is forced to spin up by -a cache miss. The disk can then be spun down in the periods of inactivity. - -If you want to find out which process caused the disk to spin up, you can -gather information by setting the flag /proc/sys/vm/block_dump. When this flag -is set, Linux reports all disk read and write operations that take place, and -all block dirtyings done to files. This makes it possible to debug why a disk -needs to spin up, and to increase battery life even more. The output of -block_dump is written to the kernel output, and it can be retrieved using -"dmesg". When you use block_dump and your kernel logging level also includes -kernel debugging messages, you probably want to turn off klogd, otherwise -the output of block_dump will be logged, causing disk activity that is not -normally there. - - -Configuration -------------- - -The laptop mode configuration file is located in /etc/default/laptop-mode on -Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It -contains the following options: - -MAX_AGE: - -Maximum time, in seconds, of hard drive spindown time that you are -comfortable with. Worst case, it's possible that you could lose this -amount of work if your battery fails while you're in laptop mode. - -MINIMUM_BATTERY_MINUTES: - -Automatically disable laptop mode if the remaining number of minutes of -battery power is less than this value. Default is 10 minutes. - -AC_HD/BATT_HD: - -The idle timeout that should be set on your hard drive when laptop mode -is active (BATT_HD) and when it is not active (AC_HD). The defaults are -20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The -possible values are those listed in the manual page for "hdparm" for the -"-S" option. - -HD: - -The devices for which the spindown timeout should be adjusted by laptop mode. -Default is /dev/hda. If you specify multiple devices, separate them by a space. - -READAHEAD: - -Disk readahead, in 512-byte sectors, while laptop mode is active. A large -readahead can prevent disk accesses for things like executable pages (which are -loaded on demand while the application executes) and sequentially accessed data -(MP3s). - -DO_REMOUNTS: - -The control script automatically remounts any mounted journaled filesystems -with appropriate commit interval options. When this option is set to 0, this -feature is disabled. - -DO_REMOUNT_NOATIME: - -When remounting, should the filesystems be remounted with the noatime option? -Normally, this is set to "1" (enabled), but there may be programs that require -access time recording. - -DIRTY_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -before a writeback is forced, while laptop mode is active. Corresponds to -the /proc/sys/vm/dirty_ratio sysctl. - -DIRTY_BACKGROUND_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set -this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio -sysctl. - -Note that the behaviour of dirty_background_ratio is quite different -when laptop mode is active and when it isn't. When laptop mode is inactive, -dirty_background_ratio is the threshold percentage at which background writeouts -start taking place. When laptop mode is active, however, background writeouts -are disabled, and the dirty_background_ratio only determines how much writeback -is done when dirty_ratio is reached. - -DO_CPU: - -Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) - -CPU_MAXFREQ: - -When on battery, what is the maximum CPU speed that the system should use? Legal -values are "slowest" for the slowest speed that your CPU is able to operate at, -or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. - - -Tips & Tricks -------------- - -* Bartek Kania reports getting up to 50 minutes of extra battery life (on top - of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). - -* You can spin down the disk while playing MP3, by setting disk readahead - to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at - once, and will then spin down while the MP3 is playing. (Thanks to Bartek - Kania.) - -* Drew Scott Daniels observed: "I don't know why, but when I decrease the number - of colours that my display uses it consumes less battery power. I've seen - this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or its users." - -* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the - file after every logging. When you're using laptop-mode and your disk doesn't - spin down, this is a likely culprit. - -* Richard Atterer observed that laptop mode does not work well with noflushd - (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode - from doing its thing. - -* If you're worried about your data, you might want to consider using a USB - memory stick or something like that as a "working area". (Be aware though - that flash memory can only handle a limited number of writes, and overuse - may wear out your memory stick pretty quickly. Do _not_ use journalling - filesystems on flash memory sticks.) - - -Configuration file for control and ACPI battery scripts -------------------------------------------------------- - -This allows the tunables to be changed for the scripts via an external -configuration file - -It should be installed as /etc/default/laptop-mode on Debian, and as -/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. - -Config file:: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - #MAX_AGE=600 - - # Automatically disable laptop mode when the number of minutes of battery - # that you have left goes below this threshold. - MINIMUM_BATTERY_MINUTES=10 - - # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG - # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk - # will read a complete MP3 at once, and will then spin down while the MP3/OGG is - # playing. - #READAHEAD=4096 - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - #DO_REMOUNTS=1 - - # And shall we add the "noatime" option to that as well? (1=yes) - #DO_REMOUNT_NOATIME=1 - - # Dirty synchronous ratio. At this percentage of dirty pages the process - # which - # calls write() does its own writeback - #DIRTY_RATIO=40 - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - #DIRTY_BACKGROUND_RATIO=5 - - # kernel default dirty buffer age - #DEF_AGE=30 - #DEF_UPDATE=5 - #DEF_DIRTY_BACKGROUND_RATIO=10 - #DEF_DIRTY_RATIO=40 - #DEF_XFS_AGE_BUFFER=15 - #DEF_XFS_SYNC_INTERVAL=30 - #DEF_XFS_BUFD_INTERVAL=1 - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still - # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for - # external interfaces, and that is currently always set to 100. So you don't - # need to change this on 2.6. - #XFS_HZ=100 - - # Should the maximum CPU frequency be adjusted down while on battery? - # Requires CPUFreq to be setup. - # See Documentation/admin-guide/pm/cpufreq.rst for more info - #DO_CPU=0 - - # When on battery what is the maximum CPU speed that the system should - # use? Legal values are "slowest" for the slowest speed that your - # CPU is able to operate at, or a value listed in: - # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies - # Only applicable if DO_CPU=1. - #CPU_MAXFREQ=slowest - - # Idle timeout for your hard drive (man hdparm for valid values, -S option) - # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). - #AC_HD=244 - #BATT_HD=4 - - # The drives for which to adjust the idle timeout. Separate them by a space, - # e.g. HD="/dev/hda /dev/hdb". - #HD="/dev/hda" - - # Set the spindown timeout on a hard drive? - #DO_HD=1 - - -Control script --------------- - -Please note that this control script works for the Linux 2.4 and 2.6 series (thanks -to Kiko Piris). - -Control script:: - - #!/bin/bash - - # start or stop laptop_mode, best run by a power management daemon when - # ac gets connected/disconnected from a laptop - # - # install as /sbin/laptop_mode - # - # Contributors to this script: Kiko Piris - # Bart Samwel - # Micha Feigin - # Andrew Morton - # Herve Eychenne - # Dax Kelson - # - # Original Linux 2.4 version by: Jens Axboe - - ############################################################################# - - # Source config - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - - # Don't raise an error if the config file is incomplete - # set defaults instead: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - MAX_AGE=${MAX_AGE:-'600'} - - # Read-ahead, in kilobytes - READAHEAD=${READAHEAD:-'4096'} - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - DO_REMOUNTS=${DO_REMOUNTS:-'1'} - - # And shall we add the "noatime" option to that as well? (1=yes) - DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} - - # Shall we adjust the idle timeout on a hard drive? - DO_HD=${DO_HD:-'1'} - - # Adjust idle timeout on which hard drive? - HD="${HD:-'/dev/hda'}" - - # spindown time for HD (hdparm -S values) - AC_HD=${AC_HD:-'244'} - BATT_HD=${BATT_HD:-'4'} - - # Dirty synchronous ratio. At this percentage of dirty pages the process which - # calls write() does its own writeback - DIRTY_RATIO=${DIRTY_RATIO:-'40'} - - # cpu frequency scaling - # See Documentation/admin-guide/pm/cpufreq.rst for more info - DO_CPU=${CPU_MANAGE:-'0'} - CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} - - # kernel default dirty buffer age - DEF_AGE=${DEF_AGE:-'30'} - DEF_UPDATE=${DEF_UPDATE:-'5'} - DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} - DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} - DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} - DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} - DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still needs - # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external - # interfaces, and that is currently always set to 100. So you don't need to - # change this on 2.6. - XFS_HZ=${XFS_HZ:-'100'} - - ############################################################################# - - KLEVEL="$(uname -r | - { - IFS='.' read a b c - echo $a.$b - } - )" - case "$KLEVEL" in - "2.4"|"2.6") - ;; - *) - echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 - exit 1 - ;; - esac - - if [ ! -e /proc/sys/vm/laptop_mode ] ; then - echo "Kernel is not patched with laptop_mode patch." >&2 - exit 1 - fi - - if [ ! -w /proc/sys/vm/laptop_mode ] ; then - echo "You do not have enough privileges to enable laptop_mode." >&2 - exit 1 - fi - - # Remove an option (the first parameter) of the form option= from - # a mount options string (the rest of the parameters). - parse_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"'=[0-9]*,/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Remove an option (the first parameter) without any arguments from - # a mount option string (the rest of the parameters). - parse_nonumber_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"',/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Find out the state of a yes/no option (e.g. "atime"/"noatime") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, the option name the second, and the default - # value the third. The remainder is the mount options string. - # - # Example: - # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime - # - # If fstab contains, say, "rw" for this filesystem, then the result - # will be "defaults,atime". - parse_yesno_opts_wfstab () { - L_DEV="$1" - OPT="$2" - DEF_OPT="$3" - shift 3 - L_OPTS="$*" - PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" - PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" - # Watch for a default atime in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then - # option specified in fstab: extract the value and use it - if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then - echo "$PARSEDOPTS1,no$OPT" - else - # no$OPT not found -- so we must have $OPT. - echo "$PARSEDOPTS1,$OPT" - fi - else - # option not specified in fstab -- choose the default. - echo "$PARSEDOPTS1,$DEF_OPT" - fi - } - - # Find out the state of a numbered option (e.g. "commit=NNN") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, and the option name the second. The - # remainder is the mount options string in which the replacement - # must be done. - # - # Example: - # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 - # - # If fstab contains, say, "commit=3,rw" for this filesystem, then the - # result will be "rw,commit=3". - parse_mount_opts_wfstab () { - L_DEV="$1" - OPT="$2" - shift 2 - L_OPTS="$*" - PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" - # Watch for a default commit in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then - # option specified in fstab: extract the value, and use it - echo -n "$PARSEDOPTS1,$OPT=" - echo ",$FSTAB_OPTS," | sed \ - -e 's/.*,'"$OPT"'=//' \ - -e 's/,.*//' - else - # option not specified in fstab: set it to 0 - echo "$PARSEDOPTS1,$OPT=0" - fi - } - - deduce_fstype () { - MP="$1" - # My root filesystem unfortunately has - # type "unknown" in /etc/mtab. If we encounter - # "unknown", we try to get the type from fstab. - cat /etc/fstab | - grep -v '^#' | - while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do - if [ "$FSTAB_MP" = "$MP" ]; then - echo $FSTAB_FST - exit 0 - fi - done - } - - if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then - NOATIME_OPT=",noatime" - fi - - case "$1" in - start) - AGE=$((100*$MAX_AGE)) - XFS_AGE=$(($XFS_HZ*$MAX_AGE)) - echo -n "Starting laptop_mode" - - if [ -d /proc/sys/vm/pagebuf ] ; then - # (For 2.4 and early 2.6.) - # This only needs to be set, not reset -- it is only used when - # laptop mode is enabled. - echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # (A couple of early 2.6 laptop mode patches had these.) - # The same goes for these. - echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then - # (2.6.6) - # But not for these -- they are also used in normal - # operation. - echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # (2.6.7 upwards) - # And not for these either. These are in centisecs, - # not USER_HZ, so we have to use $AGE, not $XFS_AGE. - echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs - echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs - echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - - case "$KLEVEL" in - "2.4") - echo 1 > /proc/sys/vm/laptop_mode - echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo 5 > /proc/sys/vm/laptop_mode - echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ]; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - PARSEDOPTS="$(parse_mount_opts "$OPTS")" - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3"|"reiserfs") - PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT - ;; - "xfs") - mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra $(($READAHEAD * 2)) $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - if [ $CPU_MAXFREQ = 'slowest' ]; then - CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` - fi - echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - stop) - U_AGE=$((100*$DEF_UPDATE)) - B_AGE=$((100*$DEF_AGE)) - echo -n "Stopping laptop_mode" - echo 0 > /proc/sys/vm/laptop_mode - if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # These need to be restored, if there are no lm_*. - echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer - echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # These need to be restored as well. - echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs - echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs - echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - case "$KLEVEL" in - "2.4") - echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ] ; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - # Reset commit and atime options to defaults. - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3"|"reiserfs") - PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - "xfs") - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra 256 $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - *) - echo "Usage: $0 {start|stop}" 2>&1 - exit 1 - ;; - - esac - - exit 0 - - -ACPI integration ----------------- - -Dax Kelson submitted this so that the ACPI acpid daemon will -kick off the laptop_mode script and run hdparm. The part that -automatically disables laptop mode when the battery is low was -written by Jan Topinski. - -/etc/acpi/events/ac_adapter:: - - event=ac_adapter - action=/etc/acpi/actions/ac.sh %e - -/etc/acpi/events/battery:: - - event=battery.* - action=/etc/acpi/actions/battery.sh %e - -/etc/acpi/actions/ac.sh:: - - #!/bin/bash - - # ac on/offline event handler - - status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` - - case $status in - "on-line") - /sbin/laptop_mode stop - exit 0 - ;; - "off-line") - /sbin/laptop_mode start - exit 0 - ;; - esac - - -/etc/acpi/actions/battery.sh:: - - #! /bin/bash - - # Automatically disable laptop mode when the battery almost runs out. - - BATT_INFO=/proc/acpi/battery/$2/state - - if [[ -f /proc/sys/vm/laptop_mode ]] - then - LM=`cat /proc/sys/vm/laptop_mode` - if [[ $LM -gt 0 ]] - then - if [[ -f $BATT_INFO ]] - then - # Source the config file only now that we know we need - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} - - ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" - if [[ ACTION -eq "discharging" ]] - then - PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - fi - if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) - then - /sbin/laptop_mode stop - fi - else - logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." - fi - fi - fi - - -Monitoring tool ---------------- - -Bartek Kania submitted this, it can be used to measure how much time your disk -spends spun up/down. See tools/laptop/dslm/dslm.c diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst deleted file mode 100644 index f2c2ffe31101..000000000000 --- a/Documentation/laptops/lg-laptop.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0+ - -:orphan: - -LG Gram laptop extra features -============================= - -By Matan Ziv-Av - - -Hotkeys -------- - -The following FN keys are ignored by the kernel without this driver: - -- FN-F1 (LG control panel) - Generates F15 -- FN-F5 (Touchpad toggle) - Generates F13 -- FN-F6 (Airplane mode) - Generates RFKILL -- FN-F8 (Keyboard backlight) - Generates F16. - This key also changes keyboard backlight mode. -- FN-F9 (Reader mode) - Generates F14 - -The rest of the FN keys work without a need for a special driver. - - -Reader mode ------------ - -Writing 0/1 to /sys/devices/platform/lg-laptop/reader_mode disables/enables -reader mode. In this mode the screen colors change (blue color reduced), -and the reader mode indicator LED (on F9 key) turns on. - - -FN Lock -------- - -Writing 0/1 to /sys/devices/platform/lg-laptop/fn_lock disables/enables -FN lock. - - -Battery care limit ------------------- - -Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit -sets the maximum capacity to charge the battery. Limiting the charge -reduces battery capacity loss over time. - -This value is reset to 100 when the kernel boots. - - -Fan mode --------- - -Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables -the fan silent mode. - - -USB charge ----------- - -Writing 0/1 to /sys/devices/platform/lg-laptop/usb_charge disables/enables -charging another device from the USB port while the device is turned off. - -This value is reset to 0 when the kernel boots. - - -LEDs -~~~~ - -The are two LED devices supported by the driver: - -Keyboard backlight ------------------- - -A led device named kbd_led controls the keyboard backlight. There are three -lighting level: off (0), low (127) and high (255). - -The keyboard backlight is also controlled by the key combination FN-F8 -which cycles through those levels. - - -Touchpad indicator LED ----------------------- - -On the F5 key. Controlled by led device names tpad_led. diff --git a/Documentation/laptops/sony-laptop.rst b/Documentation/laptops/sony-laptop.rst deleted file mode 100644 index 9edcc7f6612f..000000000000 --- a/Documentation/laptops/sony-laptop.rst +++ /dev/null @@ -1,174 +0,0 @@ -========================================= -Sony Notebook Control Driver (SNC) Readme -========================================= - - - Copyright (C) 2004- 2005 Stelian Pop - - Copyright (C) 2007 Mattia Dongili - -This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the -Sony Vaio laptops. This driver mixes both devices functions under the same -(hopefully consistent) interface. This also means that the sonypi driver is -obsoleted by sony-laptop now. - -Fn keys (hotkeys): ------------------- - -Some models report hotkeys through the SNC or SPIC devices, such events are -reported both through the ACPI subsystem as acpi events and through the INPUT -subsystem. See the logs of /proc/bus/input/devices to find out what those -events are and which input devices are created by the driver. -Additionally, loading the driver with the debug option will report all events -in the kernel log. - -The "scancodes" passed to the input system (that can be remapped with udev) -are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c -module. For example the "FN/E" key combination (EJECTCD on some models) -generates the scancode 20 (0x14). - -Backlight control: ------------------- -If your laptop model supports it, you will find sysfs files in the -/sys/class/backlight/sony/ -directory. You will be able to query and set the current screen -brightness: - - ====================== ========================================= - brightness get/set screen brightness (an integer - between 0 and 7) - actual_brightness reading from this file will query the HW - to get real brightness value - max_brightness the maximum brightness value - ====================== ========================================= - - -Platform specific: ------------------- -Loading the sony-laptop module will create a -/sys/devices/platform/sony-laptop/ -directory populated with some files. - -You then read/write integer values from/to those files by using -standard UNIX tools. - -The files are: - - ====================== ========================================== - brightness_default screen brightness which will be set - when the laptop will be rebooted - cdpower power on/off the internal CD drive - audiopower power on/off the internal sound card - lanpower power on/off the internal ethernet card - (only in debug mode) - bluetoothpower power on/off the internal bluetooth device - fanspeed get/set the fan speed - ====================== ========================================== - -Note that some files may be missing if they are not supported -by your particular laptop model. - -Example usage:: - - # echo "1" > /sys/devices/platform/sony-laptop/brightness_default - -sets the lowest screen brightness for the next and later reboots - -:: - - # echo "8" > /sys/devices/platform/sony-laptop/brightness_default - -sets the highest screen brightness for the next and later reboots - -:: - - # cat /sys/devices/platform/sony-laptop/brightness_default - -retrieves the value - -:: - - # echo "0" > /sys/devices/platform/sony-laptop/audiopower - -powers off the sound card - -:: - - # echo "1" > /sys/devices/platform/sony-laptop/audiopower - -powers on the sound card. - - -RFkill control: ---------------- -More recent Vaio models expose a consistent set of ACPI methods to -control radio frequency emitting devices. If you are a lucky owner of -such a laptop you will find the necessary rfkill devices under -/sys/class/rfkill. Check those starting with sony-* in:: - - # grep . /sys/class/rfkill/*/{state,name} - - -Development: ------------- - -If you want to help with the development of this driver (and -you are not afraid of any side effects doing strange things with -your ACPI BIOS could have on your laptop), load the driver and -pass the option 'debug=1'. - -REPEAT: - **DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.** - -In your kernel logs you will find the list of all ACPI methods -the SNC device has on your laptop. - -* For new models you will see a long list of meaningless method names, - reading the DSDT table source should reveal that: - -(1) the SNC device uses an internal capability lookup table -(2) SN00 is used to find values in the lookup table -(3) SN06 and SN07 are used to call into the real methods based on - offsets you can obtain iterating the table using SN00 -(4) SN02 used to enable events. - -Some values in the capability lookup table are more or less known, see -the code for all sony_call_snc_handle calls, others are more obscure. - -* For old models you can see the GCDP/GCDP methods used to pwer on/off - the CD drive, but there are others and they are usually different from - model to model. - -**I HAVE NO IDEA WHAT THOSE METHODS DO.** - -The sony-laptop driver creates, for some of those methods (the most -current ones found on several Vaio models), an entry under -/sys/devices/platform/sony-laptop, just like the 'cdpower' one. -You can create other entries corresponding to your own laptop methods by -further editing the source (see the 'sony_nc_values' table, and add a new -entry to this table with your get/set method names using the -SNC_HANDLE_NAMES macro). - -Your mission, should you accept it, is to try finding out what -those entries are for, by reading/writing random values from/to those -files and find out what is the impact on your laptop. - -Should you find anything interesting, please report it back to me, -I will not disavow all knowledge of your actions :) - -See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other -useful info. - -Bugs/Limitations: ------------------ - -* This driver is not based on official documentation from Sony - (because there is none), so there is no guarantee this driver - will work at all, or do the right thing. Although this hasn't - happened to me, this driver could do very bad things to your - laptop, including permanent damage. - -* The sony-laptop and sonypi drivers do not interact at all. In the - future, sonypi will be removed and replaced by sony-laptop. - -* spicctrl, which is the userspace tool used to communicate with the - sonypi driver (through /dev/sonypi) is deprecated as well since all - its features are now available under the sysfs tree via sony-laptop. diff --git a/Documentation/laptops/sonypi.rst b/Documentation/laptops/sonypi.rst deleted file mode 100644 index 2a1975ed7ee4..000000000000 --- a/Documentation/laptops/sonypi.rst +++ /dev/null @@ -1,160 +0,0 @@ -================================================== -Sony Programmable I/O Control Device Driver Readme -================================================== - - - Copyright (C) 2001-2004 Stelian Pop - - Copyright (C) 2001-2002 Alcôve - - Copyright (C) 2001 Michael Ashley - - Copyright (C) 2001 Junichi Morita - - Copyright (C) 2000 Takaya Kinjo - - Copyright (C) 2000 Andrew Tridgell - -This driver enables access to the Sony Programmable I/O Control Device which -can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be -limited to new FX series laptops, at least the FX501 and the FX702) lack a -sonypi device and are not supported at all by this driver. - -It will give access (through a user space utility) to some events those laptops -generate, like: - - - jogdial events (the small wheel on the side of Vaios) - - capture button events (only on Vaio Picturebook series) - - Fn keys - - bluetooth button (only on C1VR model) - - programmable keys, back, help, zoom, thumbphrase buttons, etc. - (when available) - -Those events (see linux/sonypi.h) can be polled using the character device node -/dev/sonypi (major 10, minor auto allocated or specified as a option). -A simple daemon which translates the jogdial movements into mouse wheel events -can be downloaded at: - -Another option to intercept the events is to get them directly through the -input layer. - -This driver supports also some ioctl commands for setting the LCD screen -brightness and querying the batteries charge information (some more -commands may be added in the future). - -This driver can also be used to set the camera controls on Picturebook series -(brightness, contrast etc), and is used by the video4linux driver for the -Motion Eye camera. - -Please note that this driver was created by reverse engineering the Windows -driver and the ACPI BIOS, because Sony doesn't agree to release any programming -specs for its laptops. If someone convinces them to do so, drop me a note. - -Driver options: ---------------- - -Several options can be passed to the sonypi driver using the standard -module argument syntax (= when passing the option to the -module or sonypi.= on the kernel boot line when sonypi is -statically linked into the kernel). Those options are: - - =============== ======================================================= - minor: minor number of the misc device /dev/sonypi, - default is -1 (automatic allocation, see /proc/misc - or kernel logs) - - camera: if you have a PictureBook series Vaio (with the - integrated MotionEye camera), set this parameter to 1 - in order to let the driver access to the camera - - fnkeyinit: on some Vaios (C1VE, C1VR etc), the Fn key events don't - get enabled unless you set this parameter to 1. - Do not use this option unless it's actually necessary, - some Vaio models don't deal well with this option. - This option is available only if the kernel is - compiled without ACPI support (since it conflicts - with it and it shouldn't be required anyway if - ACPI is already enabled). - - verbose: set to 1 to print unknown events received from the - sonypi device. - set to 2 to print all events received from the - sonypi device. - - compat: uses some compatibility code for enabling the sonypi - events. If the driver worked for you in the past - (prior to version 1.5) and does not work anymore, - add this option and report to the author. - - mask: event mask telling the driver what events will be - reported to the user. This parameter is required for - some Vaio models where the hardware reuses values - used in other Vaio models (like the FX series who does - not have a jogdial but reuses the jogdial events for - programmable keys events). The default event mask is - set to 0xffffffff, meaning that all possible events - will be tried. You can use the following bits to - construct your own event mask (from - drivers/char/sonypi.h): - - ======================== ====== - SONYPI_JOGGER_MASK 0x0001 - SONYPI_CAPTURE_MASK 0x0002 - SONYPI_FNKEY_MASK 0x0004 - SONYPI_BLUETOOTH_MASK 0x0008 - SONYPI_PKEY_MASK 0x0010 - SONYPI_BACK_MASK 0x0020 - SONYPI_HELP_MASK 0x0040 - SONYPI_LID_MASK 0x0080 - SONYPI_ZOOM_MASK 0x0100 - SONYPI_THUMBPHRASE_MASK 0x0200 - SONYPI_MEYE_MASK 0x0400 - SONYPI_MEMORYSTICK_MASK 0x0800 - SONYPI_BATTERY_MASK 0x1000 - SONYPI_WIRELESS_MASK 0x2000 - ======================== ====== - - useinput: if set (which is the default) two input devices are - created, one which interprets the jogdial events as - mouse events, the other one which acts like a - keyboard reporting the pressing of the special keys. - =============== ======================================================= - -Module use: ------------ - -In order to automatically load the sonypi module on use, you can put those -lines a configuration file in /etc/modprobe.d/:: - - alias char-major-10-250 sonypi - options sonypi minor=250 - -This supposes the use of minor 250 for the sonypi device:: - - # mknod /dev/sonypi c 10 250 - -Bugs: ------ - - - several users reported that this driver disables the BIOS-managed - Fn-keys which put the laptop in sleeping state, or switch the - external monitor on/off. There is no workaround yet, since this - driver disables all APM management for those keys, by enabling the - ACPI management (and the ACPI core stuff is not complete yet). If - you have one of those laptops with working Fn keys and want to - continue to use them, don't use this driver. - - - some users reported that the laptop speed is lower (dhrystone - tested) when using the driver with the fnkeyinit parameter. I cannot - reproduce it on my laptop and not all users have this problem. - This happens because the fnkeyinit parameter enables the ACPI - mode (but without additional ACPI control, like processor - speed handling etc). Use ACPI instead of APM if it works on your - laptop. - - - sonypi lacks the ability to distinguish between certain key - events on some models. - - - some models with the nvidia card (geforce go 6200 tc) uses a - different way to adjust the backlighting of the screen. There - is a userspace utility to adjust the brightness on those models, - which can be downloaded from - http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2 - - - since all development was done by reverse engineering, there is - *absolutely no guarantee* that this driver will not crash your - laptop. Permanently. diff --git a/Documentation/laptops/thinkpad-acpi.rst b/Documentation/laptops/thinkpad-acpi.rst deleted file mode 100644 index 19d52fc3c5e9..000000000000 --- a/Documentation/laptops/thinkpad-acpi.rst +++ /dev/null @@ -1,1562 +0,0 @@ -=========================== -ThinkPad ACPI Extras Driver -=========================== - -Version 0.25 - -October 16th, 2013 - -- Borislav Deianov -- Henrique de Moraes Holschuh - -http://ibm-acpi.sf.net/ - -This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It -supports various features of these laptops which are accessible -through the ACPI and ACPI EC framework, but not otherwise fully -supported by the generic Linux ACPI drivers. - -This driver used to be named ibm-acpi until kernel 2.6.21 and release -0.13-20070314. It used to be in the drivers/acpi tree, but it was -moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel -2.6.22, and release 0.14. It was moved to drivers/platform/x86 for -kernel 2.6.29 and release 0.22. - -The driver is named "thinkpad-acpi". In some places, like module -names and log messages, "thinkpad_acpi" is used because of userspace -issues. - -"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too -long due to length limitations on some Linux kernel versions. - -Status ------- - -The features currently supported are the following (see below for -detailed description): - - - Fn key combinations - - Bluetooth enable and disable - - video output switching, expansion control - - ThinkLight on and off - - CMOS/UCMS control - - LED control - - ACPI sounds - - temperature sensors - - Experimental: embedded controller register dump - - LCD brightness control - - Volume control - - Fan control and monitoring: fan speed, fan enable/disable - - WAN enable and disable - - UWB enable and disable - -A compatibility table by model and feature is maintained on the web -site, http://ibm-acpi.sf.net/. I appreciate any success or failure -reports, especially if they add to or correct the compatibility table. -Please include the following information in your report: - - - ThinkPad model name - - a copy of your ACPI tables, using the "acpidump" utility - - a copy of the output of dmidecode, with serial numbers - and UUIDs masked off - - which driver features work and which don't - - the observed behavior of non-working features - -Any other comments or patches are also more than welcome. - - -Installation ------------- - -If you are compiling this driver as included in the Linux kernel -sources, look for the CONFIG_THINKPAD_ACPI Kconfig option. -It is located on the menu path: "Device Drivers" -> "X86 Platform -Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras". - - -Features --------- - -The driver exports two different interfaces to userspace, which can be -used to access the features it provides. One is a legacy procfs-based -interface, which will be removed at some time in the future. The other -is a new sysfs-based interface which is not complete yet. - -The procfs interface creates the /proc/acpi/ibm directory. There is a -file under that directory for each feature it supports. The procfs -interface is mostly frozen, and will change very little if at all: it -will not be extended to add any new functionality in the driver, instead -all new functionality will be implemented on the sysfs interface. - -The sysfs interface tries to blend in the generic Linux sysfs subsystems -and classes as much as possible. Since some of these subsystems are not -yet ready or stabilized, it is expected that this interface will change, -and any and all userspace programs must deal with it. - - -Notes about the sysfs interface -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Unlike what was done with the procfs interface, correctness when talking -to the sysfs interfaces will be enforced, as will correctness in the -thinkpad-acpi's implementation of sysfs interfaces. - -Also, any bugs in the thinkpad-acpi sysfs driver code or in the -thinkpad-acpi's implementation of the sysfs interfaces will be fixed for -maximum correctness, even if that means changing an interface in -non-compatible ways. As these interfaces mature both in the kernel and -in thinkpad-acpi, such changes should become quite rare. - -Applications interfacing to the thinkpad-acpi sysfs interfaces must -follow all sysfs guidelines and correctly process all errors (the sysfs -interface makes extensive use of errors). File descriptors and open / -close operations to the sysfs inodes must also be properly implemented. - -The version of thinkpad-acpi's sysfs interface is exported by the driver -as a driver attribute (see below). - -Sysfs driver attributes are on the driver's sysfs attribute space, -for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and -/sys/bus/platform/drivers/thinkpad_hwmon/ - -Sysfs device attributes are on the thinkpad_acpi device sysfs attribute -space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/. - -Sysfs device attributes for the sensors and fan are on the -thinkpad_hwmon device's sysfs attribute space, but you should locate it -looking for a hwmon device with the name attribute of "thinkpad", or -better yet, through libsensors. For 4.14+ sysfs attributes were moved to the -hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or -/sys/class/hwmon/hwmon?). - -Driver version --------------- - -procfs: /proc/acpi/ibm/driver - -sysfs driver attribute: version - -The driver name and version. No commands can be written to this file. - - -Sysfs interface version ------------------------ - -sysfs driver attribute: interface_version - -Version of the thinkpad-acpi sysfs interface, as an unsigned long -(output in hex format: 0xAAAABBCC), where: - - AAAA - - major revision - BB - - minor revision - CC - - bugfix revision - -The sysfs interface version changelog for the driver can be found at the -end of this document. Changes to the sysfs interface done by the kernel -subsystems are not documented here, nor are they tracked by this -attribute. - -Changes to the thinkpad-acpi sysfs interface are only considered -non-experimental when they are submitted to Linux mainline, at which -point the changes in this interface are documented and interface_version -may be updated. If you are using any thinkpad-acpi features not yet -sent to mainline for merging, you do so on your own risk: these features -may disappear, or be implemented in a different and incompatible way by -the time they are merged in Linux mainline. - -Changes that are backwards-compatible by nature (e.g. the addition of -attributes that do not change the way the other attributes work) do not -always warrant an update of interface_version. Therefore, one must -expect that an attribute might not be there, and deal with it properly -(an attribute not being there *is* a valid way to make it clear that a -feature is not available in sysfs). - - -Hot keys --------- - -procfs: /proc/acpi/ibm/hotkey - -sysfs device attribute: hotkey_* - -In a ThinkPad, the ACPI HKEY handler is responsible for communicating -some important events and also keyboard hot key presses to the operating -system. Enabling the hotkey functionality of thinkpad-acpi signals the -firmware that such a driver is present, and modifies how the ThinkPad -firmware will behave in many situations. - -The driver enables the HKEY ("hot key") event reporting automatically -when loaded, and disables it when it is removed. - -The driver will report HKEY events in the following format:: - - ibm/hotkey HKEY 00000080 0000xxxx - -Some of these events refer to hot key presses, but not all of them. - -The driver will generate events over the input layer for hot keys and -radio switches, and over the ACPI netlink layer for other events. The -input layer support accepts the standard IOCTLs to remap the keycodes -assigned to each hot key. - -The hot key bit mask allows some control over which hot keys generate -events. If a key is "masked" (bit set to 0 in the mask), the firmware -will handle it. If it is "unmasked", it signals the firmware that -thinkpad-acpi would prefer to handle it, if the firmware would be so -kind to allow it (and it often doesn't!). - -Not all bits in the mask can be modified. Not all bits that can be -modified do anything. Not all hot keys can be individually controlled -by the mask. Some models do not support the mask at all. The behaviour -of the mask is, therefore, highly dependent on the ThinkPad model. - -The driver will filter out any unmasked hotkeys, so even if the firmware -doesn't allow disabling an specific hotkey, the driver will not report -events for unmasked hotkeys. - -Note that unmasking some keys prevents their default behavior. For -example, if Fn+F5 is unmasked, that key will no longer enable/disable -Bluetooth by itself in firmware. - -Note also that not all Fn key combinations are supported through ACPI -depending on the ThinkPad model and firmware version. On those -ThinkPads, it is still possible to support some extra hotkeys by -polling the "CMOS NVRAM" at least 10 times per second. The driver -attempts to enables this functionality automatically when required. - -procfs notes -^^^^^^^^^^^^ - -The following commands can be written to the /proc/acpi/ibm/hotkey file:: - - echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys - echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys - ... any other 8-hex-digit mask ... - echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask - -The following commands have been deprecated and will cause the kernel -to log a warning:: - - echo enable > /proc/acpi/ibm/hotkey -- does nothing - echo disable > /proc/acpi/ibm/hotkey -- returns an error - -The procfs interface does not support NVRAM polling control. So as to -maintain maximum bug-to-bug compatibility, it does not report any masks, -nor does it allow one to manipulate the hot key mask when the firmware -does not support masks at all, even if NVRAM polling is in use. - -sysfs notes -^^^^^^^^^^^ - - hotkey_bios_enabled: - DEPRECATED, WILL BE REMOVED SOON. - - Returns 0. - - hotkey_bios_mask: - DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE. - - Returns the hot keys mask when thinkpad-acpi was loaded. - Upon module unload, the hot keys mask will be restored - to this value. This is always 0x80c, because those are - the hotkeys that were supported by ancient firmware - without mask support. - - hotkey_enable: - DEPRECATED, WILL BE REMOVED SOON. - - 0: returns -EPERM - 1: does nothing - - hotkey_mask: - bit mask to enable reporting (and depending on - the firmware, ACPI event generation) for each hot key - (see above). Returns the current status of the hot keys - mask, and allows one to modify it. - - hotkey_all_mask: - bit mask that should enable event reporting for all - supported hot keys, when echoed to hotkey_mask above. - Unless you know which events need to be handled - passively (because the firmware *will* handle them - anyway), do *not* use hotkey_all_mask. Use - hotkey_recommended_mask, instead. You have been warned. - - hotkey_recommended_mask: - bit mask that should enable event reporting for all - supported hot keys, except those which are always - handled by the firmware anyway. Echo it to - hotkey_mask above, to use. This is the default mask - used by the driver. - - hotkey_source_mask: - bit mask that selects which hot keys will the driver - poll the NVRAM for. This is auto-detected by the driver - based on the capabilities reported by the ACPI firmware, - but it can be overridden at runtime. - - Hot keys whose bits are set in hotkey_source_mask are - polled for in NVRAM, and reported as hotkey events if - enabled in hotkey_mask. Only a few hot keys are - available through CMOS NVRAM polling. - - Warning: when in NVRAM mode, the volume up/down/mute - keys are synthesized according to changes in the mixer, - which uses a single volume up or volume down hotkey - press to unmute, as per the ThinkPad volume mixer user - interface. When in ACPI event mode, volume up/down/mute - events are reported by the firmware and can behave - differently (and that behaviour changes with firmware - version -- not just with firmware models -- as well as - OSI(Linux) state). - - hotkey_poll_freq: - frequency in Hz for hot key polling. It must be between - 0 and 25 Hz. Polling is only carried out when strictly - needed. - - Setting hotkey_poll_freq to zero disables polling, and - will cause hot key presses that require NVRAM polling - to never be reported. - - Setting hotkey_poll_freq too low may cause repeated - pressings of the same hot key to be misreported as a - single key press, or to not even be detected at all. - The recommended polling frequency is 10Hz. - - hotkey_radio_sw: - If the ThinkPad has a hardware radio switch, this - attribute will read 0 if the switch is in the "radios - disabled" position, and 1 if the switch is in the - "radios enabled" position. - - This attribute has poll()/select() support. - - hotkey_tablet_mode: - If the ThinkPad has tablet capabilities, this attribute - will read 0 if the ThinkPad is in normal mode, and - 1 if the ThinkPad is in tablet mode. - - This attribute has poll()/select() support. - - wakeup_reason: - Set to 1 if the system is waking up because the user - requested a bay ejection. Set to 2 if the system is - waking up because the user requested the system to - undock. Set to zero for normal wake-ups or wake-ups - due to unknown reasons. - - This attribute has poll()/select() support. - - wakeup_hotunplug_complete: - Set to 1 if the system was waken up because of an - undock or bay ejection request, and that request - was successfully completed. At this point, it might - be useful to send the system back to sleep, at the - user's choice. Refer to HKEY events 0x4003 and - 0x3003, below. - - This attribute has poll()/select() support. - -input layer notes -^^^^^^^^^^^^^^^^^ - -A Hot key is mapped to a single input layer EV_KEY event, possibly -followed by an EV_MSC MSC_SCAN event that shall contain that key's scan -code. An EV_SYN event will always be generated to mark the end of the -event block. - -Do not use the EV_MSC MSC_SCAN events to process keys. They are to be -used as a helper to remap keys, only. They are particularly useful when -remapping KEY_UNKNOWN keys. - -The events are available in an input device, with the following id: - - ============== ============================== - Bus BUS_HOST - vendor 0x1014 (PCI_VENDOR_ID_IBM) or - 0x17aa (PCI_VENDOR_ID_LENOVO) - product 0x5054 ("TP") - version 0x4101 - ============== ============================== - -The version will have its LSB incremented if the keymap changes in a -backwards-compatible way. The MSB shall always be 0x41 for this input -device. If the MSB is not 0x41, do not use the device as described in -this section, as it is either something else (e.g. another input device -exported by a thinkpad driver, such as HDAPS) or its functionality has -been changed in a non-backwards compatible way. - -Adding other event types for other functionalities shall be considered a -backwards-compatible change for this input device. - -Thinkpad-acpi Hot Key event map (version 0x4101): - -======= ======= ============== ============================================== -ACPI Scan -event code Key Notes -======= ======= ============== ============================================== -0x1001 0x00 FN+F1 - - -0x1002 0x01 FN+F2 IBM: battery (rare) - Lenovo: Screen lock - -0x1003 0x02 FN+F3 Many IBM models always report - this hot key, even with hot keys - disabled or with Fn+F3 masked - off - IBM: screen lock, often turns - off the ThinkLight as side-effect - Lenovo: battery - -0x1004 0x03 FN+F4 Sleep button (ACPI sleep button - semantics, i.e. sleep-to-RAM). - It always generates some kind - of event, either the hot key - event or an ACPI sleep button - event. The firmware may - refuse to generate further FN+F4 - key presses until a S3 or S4 ACPI - sleep cycle is performed or some - time passes. - -0x1005 0x04 FN+F5 Radio. Enables/disables - the internal Bluetooth hardware - and W-WAN card if left in control - of the firmware. Does not affect - the WLAN card. - Should be used to turn on/off all - radios (Bluetooth+W-WAN+WLAN), - really. - -0x1006 0x05 FN+F6 - - -0x1007 0x06 FN+F7 Video output cycle. - Do you feel lucky today? - -0x1008 0x07 FN+F8 IBM: toggle screen expand - Lenovo: configure UltraNav, - or toggle screen expand - -0x1009 0x08 FN+F9 - - -... ... ... ... - -0x100B 0x0A FN+F11 - - -0x100C 0x0B FN+F12 Sleep to disk. You are always - supposed to handle it yourself, - either through the ACPI event, - or through a hotkey event. - The firmware may refuse to - generate further FN+F12 key - press events until a S3 or S4 - ACPI sleep cycle is performed, - or some time passes. - -0x100D 0x0C FN+BACKSPACE - -0x100E 0x0D FN+INSERT - -0x100F 0x0E FN+DELETE - - -0x1010 0x0F FN+HOME Brightness up. This key is - always handled by the firmware - in IBM ThinkPads, even when - unmasked. Just leave it alone. - For Lenovo ThinkPads with a new - BIOS, it has to be handled either - by the ACPI OSI, or by userspace. - The driver does the right thing, - never mess with this. -0x1011 0x10 FN+END Brightness down. See brightness - up for details. - -0x1012 0x11 FN+PGUP ThinkLight toggle. This key is - always handled by the firmware, - even when unmasked. - -0x1013 0x12 FN+PGDOWN - - -0x1014 0x13 FN+SPACE Zoom key - -0x1015 0x14 VOLUME UP Internal mixer volume up. This - key is always handled by the - firmware, even when unmasked. - NOTE: Lenovo seems to be changing - this. -0x1016 0x15 VOLUME DOWN Internal mixer volume up. This - key is always handled by the - firmware, even when unmasked. - NOTE: Lenovo seems to be changing - this. -0x1017 0x16 MUTE Mute internal mixer. This - key is always handled by the - firmware, even when unmasked. - -0x1018 0x17 THINKPAD ThinkPad/Access IBM/Lenovo key - -0x1019 0x18 unknown - -... ... ... - -0x1020 0x1F unknown -======= ======= ============== ============================================== - -The ThinkPad firmware does not allow one to differentiate when most hot -keys are pressed or released (either that, or we don't know how to, yet). -For these keys, the driver generates a set of events for a key press and -immediately issues the same set of events for a key release. It is -unknown by the driver if the ThinkPad firmware triggered these events on -hot key press or release, but the firmware will do it for either one, not -both. - -If a key is mapped to KEY_RESERVED, it generates no input events at all. -If a key is mapped to KEY_UNKNOWN, it generates an input event that -includes an scan code. If a key is mapped to anything else, it will -generate input device EV_KEY events. - -In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW -events for switches: - -============== ============================================== -SW_RFKILL_ALL T60 and later hardware rfkill rocker switch -SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A -============== ============================================== - -Non hotkey ACPI HKEY event map ------------------------------- - -Events that are never propagated by the driver: - -====== ================================================== -0x2304 System is waking up from suspend to undock -0x2305 System is waking up from suspend to eject bay -0x2404 System is waking up from hibernation to undock -0x2405 System is waking up from hibernation to eject bay -0x5001 Lid closed -0x5002 Lid opened -0x5009 Tablet swivel: switched to tablet mode -0x500A Tablet swivel: switched to normal mode -0x5010 Brightness level changed/control event -0x6000 KEYBOARD: Numlock key pressed -0x6005 KEYBOARD: Fn key pressed (TO BE VERIFIED) -0x7000 Radio Switch may have changed state -====== ================================================== - - -Events that are propagated by the driver to userspace: - -====== ===================================================== -0x2313 ALARM: System is waking up from suspend because - the battery is nearly empty -0x2413 ALARM: System is waking up from hibernation because - the battery is nearly empty -0x3003 Bay ejection (see 0x2x05) complete, can sleep again -0x3006 Bay hotplug request (hint to power up SATA link when - the optical drive tray is ejected) -0x4003 Undocked (see 0x2x04), can sleep again -0x4010 Docked into hotplug port replicator (non-ACPI dock) -0x4011 Undocked from hotplug port replicator (non-ACPI dock) -0x500B Tablet pen inserted into its storage bay -0x500C Tablet pen removed from its storage bay -0x6011 ALARM: battery is too hot -0x6012 ALARM: battery is extremely hot -0x6021 ALARM: a sensor is too hot -0x6022 ALARM: a sensor is extremely hot -0x6030 System thermal table changed -0x6032 Thermal Control command set completion (DYTC, Windows) -0x6040 Nvidia Optimus/AC adapter related (TO BE VERIFIED) -0x60C0 X1 Yoga 2016, Tablet mode status changed -0x60F0 Thermal Transformation changed (GMTS, Windows) -====== ===================================================== - -Battery nearly empty alarms are a last resort attempt to get the -operating system to hibernate or shutdown cleanly (0x2313), or shutdown -cleanly (0x2413) before power is lost. They must be acted upon, as the -wake up caused by the firmware will have negated most safety nets... - -When any of the "too hot" alarms happen, according to Lenovo the user -should suspend or hibernate the laptop (and in the case of battery -alarms, unplug the AC adapter) to let it cool down. These alarms do -signal that something is wrong, they should never happen on normal -operating conditions. - -The "extremely hot" alarms are emergencies. According to Lenovo, the -operating system is to force either an immediate suspend or hibernate -cycle, or a system shutdown. Obviously, something is very wrong if this -happens. - - -Brightness hotkey notes -^^^^^^^^^^^^^^^^^^^^^^^ - -Don't mess with the brightness hotkeys in a Thinkpad. If you want -notifications for OSD, use the sysfs backlight class event support. - -The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events -automatically for the cases were userspace has to do something to -implement brightness changes. When you override these events, you will -either fail to handle properly the ThinkPads that require explicit -action to change backlight brightness, or the ThinkPads that require -that no action be taken to work properly. - - -Bluetooth ---------- - -procfs: /proc/acpi/ibm/bluetooth - -sysfs device attribute: bluetooth_enable (deprecated) - -sysfs rfkill class: switch "tpacpi_bluetooth_sw" - -This feature shows the presence and current state of a ThinkPad -Bluetooth device in the internal ThinkPad CDC slot. - -If the ThinkPad supports it, the Bluetooth state is stored in NVRAM, -so it is kept across reboots and power-off. - -Procfs notes -^^^^^^^^^^^^ - -If Bluetooth is installed, the following commands can be used:: - - echo enable > /proc/acpi/ibm/bluetooth - echo disable > /proc/acpi/ibm/bluetooth - -Sysfs notes -^^^^^^^^^^^ - - If the Bluetooth CDC card is installed, it can be enabled / - disabled through the "bluetooth_enable" thinkpad-acpi device - attribute, and its current status can also be queried. - - enable: - - - 0: disables Bluetooth / Bluetooth is disabled - - 1: enables Bluetooth / Bluetooth is enabled. - - Note: this interface has been superseded by the generic rfkill - class. It has been deprecated, and it will be removed in year - 2010. - - rfkill controller switch "tpacpi_bluetooth_sw": refer to - Documentation/rfkill.txt for details. - - -Video output control -- /proc/acpi/ibm/video --------------------------------------------- - -This feature allows control over the devices used for video output - -LCD, CRT or DVI (if available). The following commands are available:: - - echo lcd_enable > /proc/acpi/ibm/video - echo lcd_disable > /proc/acpi/ibm/video - echo crt_enable > /proc/acpi/ibm/video - echo crt_disable > /proc/acpi/ibm/video - echo dvi_enable > /proc/acpi/ibm/video - echo dvi_disable > /proc/acpi/ibm/video - echo auto_enable > /proc/acpi/ibm/video - echo auto_disable > /proc/acpi/ibm/video - echo expand_toggle > /proc/acpi/ibm/video - echo video_switch > /proc/acpi/ibm/video - -NOTE: - Access to this feature is restricted to processes owning the - CAP_SYS_ADMIN capability for safety reasons, as it can interact badly - enough with some versions of X.org to crash it. - -Each video output device can be enabled or disabled individually. -Reading /proc/acpi/ibm/video shows the status of each device. - -Automatic video switching can be enabled or disabled. When automatic -video switching is enabled, certain events (e.g. opening the lid, -docking or undocking) cause the video output device to change -automatically. While this can be useful, it also causes flickering -and, on the X40, video corruption. By disabling automatic switching, -the flickering or video corruption can be avoided. - -The video_switch command cycles through the available video outputs -(it simulates the behavior of Fn-F7). - -Video expansion can be toggled through this feature. This controls -whether the display is expanded to fill the entire LCD screen when a -mode with less than full resolution is used. Note that the current -video expansion status cannot be determined through this feature. - -Note that on many models (particularly those using Radeon graphics -chips) the X driver configures the video card in a way which prevents -Fn-F7 from working. This also disables the video output switching -features of this driver, as it uses the same ACPI methods as -Fn-F7. Video switching on the console should still work. - -UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000 - - -ThinkLight control ------------------- - -procfs: /proc/acpi/ibm/light - -sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED - -procfs notes -^^^^^^^^^^^^ - -The ThinkLight status can be read and set through the procfs interface. A -few models which do not make the status available will show the ThinkLight -status as "unknown". The available commands are:: - - echo on > /proc/acpi/ibm/light - echo off > /proc/acpi/ibm/light - -sysfs notes -^^^^^^^^^^^ - -The ThinkLight sysfs interface is documented by the LED class -documentation, in Documentation/leds/leds-class.rst. The ThinkLight LED name -is "tpacpi::thinklight". - -Due to limitations in the sysfs LED class, if the status of the ThinkLight -cannot be read or if it is unknown, thinkpad-acpi will report it as "off". -It is impossible to know if the status returned through sysfs is valid. - - -CMOS/UCMS control ------------------ - -procfs: /proc/acpi/ibm/cmos - -sysfs device attribute: cmos_command - -This feature is mostly used internally by the ACPI firmware to keep the legacy -CMOS NVRAM bits in sync with the current machine state, and to record this -state so that the ThinkPad will retain such settings across reboots. - -Some of these commands actually perform actions in some ThinkPad models, but -this is expected to disappear more and more in newer models. As an example, in -a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for -real, but commands 0 to 2 don't control the mixer anymore (they have been -phased out) and just update the NVRAM. - -The range of valid cmos command numbers is 0 to 21, but not all have an -effect and the behavior varies from model to model. Here is the behavior -on the X40 (tpb is the ThinkPad Buttons utility): - - - 0 - Related to "Volume down" key press - - 1 - Related to "Volume up" key press - - 2 - Related to "Mute on" key press - - 3 - Related to "Access IBM" key press - - 4 - Related to "LCD brightness up" key press - - 5 - Related to "LCD brightness down" key press - - 11 - Related to "toggle screen expansion" key press/function - - 12 - Related to "ThinkLight on" - - 13 - Related to "ThinkLight off" - - 14 - Related to "ThinkLight" key press (toggle ThinkLight) - -The cmos command interface is prone to firmware split-brain problems, as -in newer ThinkPads it is just a compatibility layer. Do not use it, it is -exported just as a debug tool. - - -LED control ------------ - -procfs: /proc/acpi/ibm/led -sysfs attributes: as per LED class, see below for names - -Some of the LED indicators can be controlled through this feature. On -some older ThinkPad models, it is possible to query the status of the -LED indicators as well. Newer ThinkPads cannot query the real status -of the LED indicators. - -Because misuse of the LEDs could induce an unaware user to perform -dangerous actions (like undocking or ejecting a bay device while the -buses are still active), or mask an important alarm (such as a nearly -empty battery, or a broken battery), access to most LEDs is -restricted. - -Unrestricted access to all LEDs requires that thinkpad-acpi be -compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled. -Distributions must never enable this option. Individual users that -are aware of the consequences are welcome to enabling it. - -Audio mute and microphone mute LEDs are supported, but currently not -visible to userspace. They are used by the snd-hda-intel audio driver. - -procfs notes -^^^^^^^^^^^^ - -The available commands are:: - - echo ' on' >/proc/acpi/ibm/led - echo ' off' >/proc/acpi/ibm/led - echo ' blink' >/proc/acpi/ibm/led - -The range is 0 to 15. The set of LEDs that can be -controlled varies from model to model. Here is the common ThinkPad -mapping: - - - 0 - power - - 1 - battery (orange) - - 2 - battery (green) - - 3 - UltraBase/dock - - 4 - UltraBay - - 5 - UltraBase battery slot - - 6 - (unknown) - - 7 - standby - - 8 - dock status 1 - - 9 - dock status 2 - - 10, 11 - (unknown) - - 12 - thinkvantage - - 13, 14, 15 - (unknown) - -All of the above can be turned on and off and can be made to blink. - -sysfs notes -^^^^^^^^^^^ - -The ThinkPad LED sysfs interface is described in detail by the LED class -documentation, in Documentation/leds/leds-class.rst. - -The LEDs are named (in LED ID order, from 0 to 12): -"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt", -"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt", -"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1", -"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3", -"tpacpi::thinkvantage". - -Due to limitations in the sysfs LED class, if the status of the LED -indicators cannot be read due to an error, thinkpad-acpi will report it as -a brightness of zero (same as LED off). - -If the thinkpad firmware doesn't support reading the current status, -trying to read the current LED brightness will just return whatever -brightness was last written to that attribute. - -These LEDs can blink using hardware acceleration. To request that a -ThinkPad indicator LED should blink in hardware accelerated mode, use the -"timer" trigger, and leave the delay_on and delay_off parameters set to -zero (to request hardware acceleration autodetection). - -LEDs that are known not to exist in a given ThinkPad model are not -made available through the sysfs interface. If you have a dock and you -notice there are LEDs listed for your ThinkPad that do not exist (and -are not in the dock), or if you notice that there are missing LEDs, -a report to ibm-acpi-devel@lists.sourceforge.net is appreciated. - - -ACPI sounds -- /proc/acpi/ibm/beep ----------------------------------- - -The BEEP method is used internally by the ACPI firmware to provide -audible alerts in various situations. This feature allows the same -sounds to be triggered manually. - -The commands are non-negative integer numbers:: - - echo >/proc/acpi/ibm/beep - -The valid range is 0 to 17. Not all numbers trigger sounds -and the sounds vary from model to model. Here is the behavior on the -X40: - - - 0 - stop a sound in progress (but use 17 to stop 16) - - 2 - two beeps, pause, third beep ("low battery") - - 3 - single beep - - 4 - high, followed by low-pitched beep ("unable") - - 5 - single beep - - 6 - very high, followed by high-pitched beep ("AC/DC") - - 7 - high-pitched beep - - 9 - three short beeps - - 10 - very long beep - - 12 - low-pitched beep - - 15 - three high-pitched beeps repeating constantly, stop with 0 - - 16 - one medium-pitched beep repeating constantly, stop with 17 - - 17 - stop 16 - - -Temperature sensors -------------------- - -procfs: /proc/acpi/ibm/thermal - -sysfs device attributes: (hwmon "thinkpad") temp*_input - -Most ThinkPads include six or more separate temperature sensors but only -expose the CPU temperature through the standard ACPI methods. This -feature shows readings from up to eight different sensors on older -ThinkPads, and up to sixteen different sensors on newer ThinkPads. - -For example, on the X40, a typical output may be: - -temperatures: - 42 42 45 41 36 -128 33 -128 - -On the T43/p, a typical output may be: - -temperatures: - 48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128 - -The mapping of thermal sensors to physical locations varies depending on -system-board model (and thus, on ThinkPad model). - -http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that -tries to track down these locations for various models. - -Most (newer?) models seem to follow this pattern: - -- 1: CPU -- 2: (depends on model) -- 3: (depends on model) -- 4: GPU -- 5: Main battery: main sensor -- 6: Bay battery: main sensor -- 7: Main battery: secondary sensor -- 8: Bay battery: secondary sensor -- 9-15: (depends on model) - -For the R51 (source: Thomas Gruber): - -- 2: Mini-PCI -- 3: Internal HDD - -For the T43, T43/p (source: Shmidoax/Thinkwiki.org) -http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p - -- 2: System board, left side (near PCMCIA slot), reported as HDAPS temp -- 3: PCMCIA slot -- 9: MCH (northbridge) to DRAM Bus -- 10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI - card, under touchpad -- 11: Power regulator, underside of system board, below F2 key - -The A31 has a very atypical layout for the thermal sensors -(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31) - -- 1: CPU -- 2: Main Battery: main sensor -- 3: Power Converter -- 4: Bay Battery: main sensor -- 5: MCH (northbridge) -- 6: PCMCIA/ambient -- 7: Main Battery: secondary sensor -- 8: Bay Battery: secondary sensor - - -Procfs notes -^^^^^^^^^^^^ - - Readings from sensors that are not available return -128. - No commands can be written to this file. - -Sysfs notes -^^^^^^^^^^^ - - Sensors that are not available return the ENXIO error. This - status may change at runtime, as there are hotplug thermal - sensors, like those inside the batteries and docks. - - thinkpad-acpi thermal sensors are reported through the hwmon - subsystem, and follow all of the hwmon guidelines at - Documentation/hwmon. - -EXPERIMENTAL: Embedded controller register dump ------------------------------------------------ - -This feature is not included in the thinkpad driver anymore. -Instead the EC can be accessed through /sys/kernel/debug/ec with -a userspace tool which can be found here: -ftp://ftp.suse.com/pub/people/trenn/sources/ec - -Use it to determine the register holding the fan -speed on some models. To do that, do the following: - - - make sure the battery is fully charged - - make sure the fan is running - - use above mentioned tool to read out the EC - -Often fan and temperature values vary between -readings. Since temperatures don't change vary fast, you can take -several quick dumps to eliminate them. - -You can use a similar method to figure out the meaning of other -embedded controller registers - e.g. make sure nothing else changes -except the charging or discharging battery to determine which -registers contain the current battery capacity, etc. If you experiment -with this, do send me your results (including some complete dumps with -a description of the conditions when they were taken.) - - -LCD brightness control ----------------------- - -procfs: /proc/acpi/ibm/brightness - -sysfs backlight device "thinkpad_screen" - -This feature allows software control of the LCD brightness on ThinkPad -models which don't have a hardware brightness slider. - -It has some limitations: the LCD backlight cannot be actually turned -on or off by this interface, it just controls the backlight brightness -level. - -On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control -has eight brightness levels, ranging from 0 to 7. Some of the levels -may not be distinct. Later Lenovo models that implement the ACPI -display backlight brightness control methods have 16 levels, ranging -from 0 to 15. - -For IBM ThinkPads, there are two interfaces to the firmware for direct -brightness control, EC and UCMS (or CMOS). To select which one should be -used, use the brightness_mode module parameter: brightness_mode=1 selects -EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC -mode with NVRAM backing (so that brightness changes are remembered across -shutdown/reboot). - -The driver tries to select which interface to use from a table of -defaults for each ThinkPad model. If it makes a wrong choice, please -report this as a bug, so that we can fix it. - -Lenovo ThinkPads only support brightness_mode=2 (UCMS). - -When display backlight brightness controls are available through the -standard ACPI interface, it is best to use it instead of this direct -ThinkPad-specific interface. The driver will disable its native -backlight brightness control interface if it detects that the standard -ACPI interface is available in the ThinkPad. - -If you want to use the thinkpad-acpi backlight brightness control -instead of the generic ACPI video backlight brightness control for some -reason, you should use the acpi_backlight=vendor kernel parameter. - -The brightness_enable module parameter can be used to control whether -the LCD brightness control feature will be enabled when available. -brightness_enable=0 forces it to be disabled. brightness_enable=1 -forces it to be enabled when available, even if the standard ACPI -interface is also available. - -Procfs notes -^^^^^^^^^^^^ - -The available commands are:: - - echo up >/proc/acpi/ibm/brightness - echo down >/proc/acpi/ibm/brightness - echo 'level ' >/proc/acpi/ibm/brightness - -Sysfs notes -^^^^^^^^^^^ - -The interface is implemented through the backlight sysfs class, which is -poorly documented at this time. - -Locate the thinkpad_screen device under /sys/class/backlight, and inside -it there will be the following attributes: - - max_brightness: - Reads the maximum brightness the hardware can be set to. - The minimum is always zero. - - actual_brightness: - Reads what brightness the screen is set to at this instant. - - brightness: - Writes request the driver to change brightness to the - given value. Reads will tell you what brightness the - driver is trying to set the display to when "power" is set - to zero and the display has not been dimmed by a kernel - power management event. - - power: - power management mode, where 0 is "display on", and 1 to 3 - will dim the display backlight to brightness level 0 - because thinkpad-acpi cannot really turn the backlight - off. Kernel power management events can temporarily - increase the current power management level, i.e. they can - dim the display. - - -WARNING: - - Whatever you do, do NOT ever call thinkpad-acpi backlight-level change - interface and the ACPI-based backlight level change interface - (available on newer BIOSes, and driven by the Linux ACPI video driver) - at the same time. The two will interact in bad ways, do funny things, - and maybe reduce the life of the backlight lamps by needlessly kicking - its level up and down at every change. - - -Volume control (Console Audio control) --------------------------------------- - -procfs: /proc/acpi/ibm/volume - -ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC" - -NOTE: by default, the volume control interface operates in read-only -mode, as it is supposed to be used for on-screen-display purposes. -The read/write mode can be enabled through the use of the -"volume_control=1" module parameter. - -NOTE: distros are urged to not enable volume_control by default, this -should be done by the local admin only. The ThinkPad UI is for the -console audio control to be done through the volume keys only, and for -the desktop environment to just provide on-screen-display feedback. -Software volume control should be done only in the main AC97/HDA -mixer. - - -About the ThinkPad Console Audio control -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -ThinkPads have a built-in amplifier and muting circuit that drives the -console headphone and speakers. This circuit is after the main AC97 -or HDA mixer in the audio path, and under exclusive control of the -firmware. - -ThinkPads have three special hotkeys to interact with the console -audio control: volume up, volume down and mute. - -It is worth noting that the normal way the mute function works (on -ThinkPads that do not have a "mute LED") is: - -1. Press mute to mute. It will *always* mute, you can press it as - many times as you want, and the sound will remain mute. - -2. Press either volume key to unmute the ThinkPad (it will _not_ - change the volume, it will just unmute). - -This is a very superior design when compared to the cheap software-only -mute-toggle solution found on normal consumer laptops: you can be -absolutely sure the ThinkPad will not make noise if you press the mute -button, no matter the previous state. - -The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain -amplifiers driving the speakers and headphone output, and the firmware -also handles volume control for the headphone and speakers on these -ThinkPads without any help from the operating system (this volume -control stage exists after the main AC97 or HDA mixer in the audio -path). - -The newer Lenovo models only have firmware mute control, and depend on -the main HDA mixer to do volume control (which is done by the operating -system). In this case, the volume keys are filtered out for unmute -key press (there are some firmware bugs in this area) and delivered as -normal key presses to the operating system (thinkpad-acpi is not -involved). - - -The ThinkPad-ACPI volume control -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The preferred way to interact with the Console Audio control is the -ALSA interface. - -The legacy procfs interface allows one to read the current state, -and if volume control is enabled, accepts the following commands:: - - echo up >/proc/acpi/ibm/volume - echo down >/proc/acpi/ibm/volume - echo mute >/proc/acpi/ibm/volume - echo unmute >/proc/acpi/ibm/volume - echo 'level ' >/proc/acpi/ibm/volume - -The number range is 0 to 14 although not all of them may be -distinct. To unmute the volume after the mute command, use either the -up or down command (the level command will not unmute the volume), or -the unmute command. - -You can use the volume_capabilities parameter to tell the driver -whether your thinkpad has volume control or mute-only control: -volume_capabilities=1 for mixers with mute and volume control, -volume_capabilities=2 for mixers with only mute control. - -If the driver misdetects the capabilities for your ThinkPad model, -please report this to ibm-acpi-devel@lists.sourceforge.net, so that we -can update the driver. - -There are two strategies for volume control. To select which one -should be used, use the volume_mode module parameter: volume_mode=1 -selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing -(so that volume/mute changes are remembered across shutdown/reboot). - -The driver will operate in volume_mode=3 by default. If that does not -work well on your ThinkPad model, please report this to -ibm-acpi-devel@lists.sourceforge.net. - -The driver supports the standard ALSA module parameters. If the ALSA -mixer is disabled, the driver will disable all volume functionality. - - -Fan control and monitoring: fan speed, fan enable/disable ---------------------------------------------------------- - -procfs: /proc/acpi/ibm/fan - -sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, pwm1_enable, fan2_input - -sysfs hwmon driver attributes: fan_watchdog - -NOTE NOTE NOTE: - fan control operations are disabled by default for - safety reasons. To enable them, the module parameter "fan_control=1" - must be given to thinkpad-acpi. - -This feature attempts to show the current fan speed, control mode and -other fan data that might be available. The speed is read directly -from the hardware registers of the embedded controller. This is known -to work on later R, T, X and Z series ThinkPads but may show a bogus -value on other models. - -Some Lenovo ThinkPads support a secondary fan. This fan cannot be -controlled separately, it shares the main fan control. - -Fan levels -^^^^^^^^^^ - -Most ThinkPad fans work in "levels" at the firmware interface. Level 0 -stops the fan. The higher the level, the higher the fan speed, although -adjacent levels often map to the same fan speed. 7 is the highest -level, where the fan reaches the maximum recommended speed. - -Level "auto" means the EC changes the fan level according to some -internal algorithm, usually based on readings from the thermal sensors. - -There is also a "full-speed" level, also known as "disengaged" level. -In this level, the EC disables the speed-locked closed-loop fan control, -and drives the fan as fast as it can go, which might exceed hardware -limits, so use this level with caution. - -The fan usually ramps up or down slowly from one speed to another, and -it is normal for the EC to take several seconds to react to fan -commands. The full-speed level may take up to two minutes to ramp up to -maximum speed, and in some ThinkPads, the tachometer readings go stale -while the EC is transitioning to the full-speed level. - -WARNING WARNING WARNING: do not leave the fan disabled unless you are -monitoring all of the temperature sensor readings and you are ready to -enable it if necessary to avoid overheating. - -An enabled fan in level "auto" may stop spinning if the EC decides the -ThinkPad is cool enough and doesn't need the extra airflow. This is -normal, and the EC will spin the fan up if the various thermal readings -rise too much. - -On the X40, this seems to depend on the CPU and HDD temperatures. -Specifically, the fan is turned on when either the CPU temperature -climbs to 56 degrees or the HDD temperature climbs to 46 degrees. The -fan is turned off when the CPU temperature drops to 49 degrees and the -HDD temperature drops to 41 degrees. These thresholds cannot -currently be controlled. - -The ThinkPad's ACPI DSDT code will reprogram the fan on its own when -certain conditions are met. It will override any fan programming done -through thinkpad-acpi. - -The thinkpad-acpi kernel driver can be programmed to revert the fan -level to a safe setting if userspace does not issue one of the procfs -fan commands: "enable", "disable", "level" or "watchdog", or if there -are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is -set to 1, manual mode) within a configurable amount of time of up to -120 seconds. This functionality is called fan safety watchdog. - -Note that the watchdog timer stops after it enables the fan. It will be -rearmed again automatically (using the same interval) when one of the -above mentioned fan commands is received. The fan watchdog is, -therefore, not suitable to protect against fan mode changes made through -means other than the "enable", "disable", and "level" procfs fan -commands, or the hwmon fan control sysfs interface. - -Procfs notes -^^^^^^^^^^^^ - -The fan may be enabled or disabled with the following commands:: - - echo enable >/proc/acpi/ibm/fan - echo disable >/proc/acpi/ibm/fan - -Placing a fan on level 0 is the same as disabling it. Enabling a fan -will try to place it in a safe level if it is too slow or disabled. - -The fan level can be controlled with the command:: - - echo 'level ' > /proc/acpi/ibm/fan - -Where is an integer from 0 to 7, or one of the words "auto" or -"full-speed" (without the quotes). Not all ThinkPads support the "auto" -and "full-speed" levels. The driver accepts "disengaged" as an alias for -"full-speed", and reports it as "disengaged" for backwards -compatibility. - -On the X31 and X40 (and ONLY on those models), the fan speed can be -controlled to a certain degree. Once the fan is running, it can be -forced to run faster or slower with the following command:: - - echo 'speed ' > /proc/acpi/ibm/fan - -The sustainable range of fan speeds on the X40 appears to be from about -3700 to about 7350. Values outside this range either do not have any -effect or the fan speed eventually settles somewhere in that range. The -fan cannot be stopped or started with this command. This functionality -is incomplete, and not available through the sysfs interface. - -To program the safety watchdog, use the "watchdog" command:: - - echo 'watchdog ' > /proc/acpi/ibm/fan - -If you want to disable the watchdog, use 0 as the interval. - -Sysfs notes -^^^^^^^^^^^ - -The sysfs interface follows the hwmon subsystem guidelines for the most -part, and the exception is the fan safety watchdog. - -Writes to any of the sysfs attributes may return the EINVAL error if -that operation is not supported in a given ThinkPad or if the parameter -is out-of-bounds, and EPERM if it is forbidden. They may also return -EINTR (interrupted system call), and EIO (I/O error while trying to talk -to the firmware). - -Features not yet implemented by the driver return ENOSYS. - -hwmon device attribute pwm1_enable: - - 0: PWM offline (fan is set to full-speed mode) - - 1: Manual PWM control (use pwm1 to set fan level) - - 2: Hardware PWM control (EC "auto" mode) - - 3: reserved (Software PWM control, not implemented yet) - - Modes 0 and 2 are not supported by all ThinkPads, and the - driver is not always able to detect this. If it does know a - mode is unsupported, it will return -EINVAL. - -hwmon device attribute pwm1: - Fan level, scaled from the firmware values of 0-7 to the hwmon - scale of 0-255. 0 means fan stopped, 255 means highest normal - speed (level 7). - - This attribute only commands the fan if pmw1_enable is set to 1 - (manual PWM control). - -hwmon device attribute fan1_input: - Fan tachometer reading, in RPM. May go stale on certain - ThinkPads while the EC transitions the PWM to offline mode, - which can take up to two minutes. May return rubbish on older - ThinkPads. - -hwmon device attribute fan2_input: - Fan tachometer reading, in RPM, for the secondary fan. - Available only on some ThinkPads. If the secondary fan is - not installed, will always read 0. - -hwmon driver attribute fan_watchdog: - Fan safety watchdog timer interval, in seconds. Minimum is - 1 second, maximum is 120 seconds. 0 disables the watchdog. - -To stop the fan: set pwm1 to zero, and pwm1_enable to 1. - -To start the fan in a safe mode: set pwm1_enable to 2. If that fails -with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255 -would be the safest choice, though). - - -WAN ---- - -procfs: /proc/acpi/ibm/wan - -sysfs device attribute: wwan_enable (deprecated) - -sysfs rfkill class: switch "tpacpi_wwan_sw" - -This feature shows the presence and current state of the built-in -Wireless WAN device. - -If the ThinkPad supports it, the WWAN state is stored in NVRAM, -so it is kept across reboots and power-off. - -It was tested on a Lenovo ThinkPad X60. It should probably work on other -ThinkPad models which come with this module installed. - -Procfs notes -^^^^^^^^^^^^ - -If the W-WAN card is installed, the following commands can be used:: - - echo enable > /proc/acpi/ibm/wan - echo disable > /proc/acpi/ibm/wan - -Sysfs notes -^^^^^^^^^^^ - - If the W-WAN card is installed, it can be enabled / - disabled through the "wwan_enable" thinkpad-acpi device - attribute, and its current status can also be queried. - - enable: - - 0: disables WWAN card / WWAN card is disabled - - 1: enables WWAN card / WWAN card is enabled. - - Note: this interface has been superseded by the generic rfkill - class. It has been deprecated, and it will be removed in year - 2010. - - rfkill controller switch "tpacpi_wwan_sw": refer to - Documentation/rfkill.txt for details. - - -EXPERIMENTAL: UWB ------------------ - -This feature is considered EXPERIMENTAL because it has not been extensively -tested and validated in various ThinkPad models yet. The feature may not -work as expected. USE WITH CAUTION! To use this feature, you need to supply -the experimental=1 parameter when loading the module. - -sysfs rfkill class: switch "tpacpi_uwb_sw" - -This feature exports an rfkill controller for the UWB device, if one is -present and enabled in the BIOS. - -Sysfs notes -^^^^^^^^^^^ - - rfkill controller switch "tpacpi_uwb_sw": refer to - Documentation/rfkill.txt for details. - -Adaptive keyboard ------------------ - -sysfs device attribute: adaptive_kbd_mode - -This sysfs attribute controls the keyboard "face" that will be shown on the -Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read -and set. - -- 1 = Home mode -- 2 = Web-browser mode -- 3 = Web-conference mode -- 4 = Function mode -- 5 = Layflat mode - -For more details about which buttons will appear depending on the mode, please -review the laptop's user guide: -http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf - -Multiple Commands, Module Parameters ------------------------------------- - -Multiple commands can be written to the proc files in one shot by -separating them with commas, for example:: - - echo enable,0xffff > /proc/acpi/ibm/hotkey - echo lcd_disable,crt_enable > /proc/acpi/ibm/video - -Commands can also be specified when loading the thinkpad-acpi module, -for example:: - - modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable - - -Enabling debugging output -------------------------- - -The module takes a debug parameter which can be used to selectively -enable various classes of debugging output, for example:: - - modprobe thinkpad_acpi debug=0xffff - -will enable all debugging output classes. It takes a bitmask, so -to enable more than one output class, just add their values. - - ============= ====================================== - Debug bitmask Description - ============= ====================================== - 0x8000 Disclose PID of userspace programs - accessing some functions of the driver - 0x0001 Initialization and probing - 0x0002 Removal - 0x0004 RF Transmitter control (RFKILL) - (bluetooth, WWAN, UWB...) - 0x0008 HKEY event interface, hotkeys - 0x0010 Fan control - 0x0020 Backlight brightness - 0x0040 Audio mixer/volume control - ============= ====================================== - -There is also a kernel build option to enable more debugging -information, which may be necessary to debug driver problems. - -The level of debugging information output by the driver can be changed -at runtime through sysfs, using the driver attribute debug_level. The -attribute takes the same bitmask as the debug module parameter above. - - -Force loading of module ------------------------ - -If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify -the module parameter force_load=1. Regardless of whether this works or -not, please contact ibm-acpi-devel@lists.sourceforge.net with a report. - - -Sysfs interface changelog -^^^^^^^^^^^^^^^^^^^^^^^^^ - -========= =============================================================== -0x000100: Initial sysfs support, as a single platform driver and - device. -0x000200: Hot key support for 32 hot keys, and radio slider switch - support. -0x010000: Hot keys are now handled by default over the input - layer, the radio switch generates input event EV_RADIO, - and the driver enables hot key handling by default in - the firmware. - -0x020000: ABI fix: added a separate hwmon platform device and - driver, which must be located by name (thinkpad) - and the hwmon class for libsensors4 (lm-sensors 3) - compatibility. Moved all hwmon attributes to this - new platform device. - -0x020100: Marker for thinkpad-acpi with hot key NVRAM polling - support. If you must, use it to know you should not - start a userspace NVRAM poller (allows to detect when - NVRAM is compiled out by the user because it is - unneeded/undesired in the first place). -0x020101: Marker for thinkpad-acpi with hot key NVRAM polling - and proper hotkey_mask semantics (version 8 of the - NVRAM polling patch). Some development snapshots of - 0.18 had an earlier version that did strange things - to hotkey_mask. - -0x020200: Add poll()/select() support to the following attributes: - hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason - -0x020300: hotkey enable/disable support removed, attributes - hotkey_bios_enabled and hotkey_enable deprecated and - marked for removal. - -0x020400: Marker for 16 LEDs support. Also, LEDs that are known - to not exist in a given model are not registered with - the LED sysfs class anymore. - -0x020500: Updated hotkey driver, hotkey_mask is always available - and it is always able to disable hot keys. Very old - thinkpads are properly supported. hotkey_bios_mask - is deprecated and marked for removal. - -0x020600: Marker for backlight change event support. - -0x020700: Support for mute-only mixers. - Volume control in read-only mode by default. - Marker for ALSA mixer support. - -0x030000: Thermal and fan sysfs attributes were moved to the hwmon - device instead of being attached to the backing platform - device. -========= =============================================================== diff --git a/Documentation/laptops/toshiba_haps.rst b/Documentation/laptops/toshiba_haps.rst deleted file mode 100644 index 11dfc428c080..000000000000 --- a/Documentation/laptops/toshiba_haps.rst +++ /dev/null @@ -1,87 +0,0 @@ -==================================== -Toshiba HDD Active Protection Sensor -==================================== - -Kernel driver: toshiba_haps - -Author: Azael Avalos - - -.. 0. Contents - - 1. Description - 2. Interface - 3. Accelerometer axes - 4. Supported devices - 5. Usage - - -1. Description --------------- - -This driver provides support for the accelerometer found in various Toshiba -laptops, being called "Toshiba HDD Protection - Shock Sensor" officially, -and detects laptops automatically with this device. -On Windows, Toshiba provided software monitors this device and provides -automatic HDD protection (head unload) on sudden moves or harsh vibrations, -however, this driver only provides a notification via a sysfs file to let -userspace tools or daemons act accordingly, as well as providing a sysfs -file to set the desired protection level or sensor sensibility. - - -2. Interface ------------- - -This device comes with 3 methods: - -==== ===================================================================== -_STA Checks existence of the device, returning Zero if the device does not - exists or is not supported. -PTLV Sets the desired protection level. -RSSS Shuts down the HDD protection interface for a few seconds, - then restores normal operation. -==== ===================================================================== - -Note: - The presence of Solid State Drives (SSD) can make this driver to fail loading, - given the fact that such drives have no movable parts, and thus, not requiring - any "protection" as well as failing during the evaluation of the _STA method - found under this device. - - -3. Accelerometer axes ---------------------- - -This device does not report any axes, however, to query the sensor position -a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are -provided to query such information, handled by the kernel module toshiba_acpi -since kernel version 3.15. - - -4. Supported devices --------------------- - -This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop -with this device is supported, given the fact that they have the presence of -conventional HDD and not only SSD, or a combination of both HDD and SSD. - - -5. Usage --------- - -The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are: - -================ ============================================================ -protection_level The protection_level is readable and writeable, and - provides a way to let userspace query the current protection - level, as well as set the desired protection level, the - available protection levels are: - - ============ ======= ========== ======== - 0 - Disabled 1 - Low 2 - Medium 3 - High - ============ ======= ========== ======== - -reset_protection The reset_protection entry is writeable only, being "1" - the only parameter it accepts, it is used to trigger - a reset of the protection interface. -================ ============================================================ -- cgit From 330d48105245abfb8c9ca491dc53ea500657217a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 15:21:39 -0300 Subject: docs: admin-guide: add kdump documentation into it The Kdump documentation describes procedures with admins use in order to solve issues on their systems. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/bug-hunting.rst | 4 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kdump/gdbmacros.txt | 264 +++++++++++ Documentation/admin-guide/kdump/index.rst | 20 + Documentation/admin-guide/kdump/kdump.rst | 534 ++++++++++++++++++++++ Documentation/admin-guide/kdump/vmcoreinfo.rst | 488 ++++++++++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/kdump/gdbmacros.txt | 264 ----------- Documentation/kdump/index.rst | 21 - Documentation/kdump/kdump.rst | 534 ---------------------- Documentation/kdump/vmcoreinfo.rst | 488 -------------------- Documentation/powerpc/firmware-assisted-dump.txt | 2 +- Documentation/translations/zh_CN/oops-tracing.txt | 4 +- 13 files changed, 1315 insertions(+), 1315 deletions(-) create mode 100644 Documentation/admin-guide/kdump/gdbmacros.txt create mode 100644 Documentation/admin-guide/kdump/index.rst create mode 100644 Documentation/admin-guide/kdump/kdump.rst create mode 100644 Documentation/admin-guide/kdump/vmcoreinfo.rst delete mode 100644 Documentation/kdump/gdbmacros.txt delete mode 100644 Documentation/kdump/index.rst delete mode 100644 Documentation/kdump/kdump.rst delete mode 100644 Documentation/kdump/vmcoreinfo.rst (limited to 'Documentation') diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index b761aa2a51d2..44b8a4edd348 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -90,9 +90,9 @@ the disk is not available then you have three options: run a null modem to a second machine and capture the output there using your favourite communication program. Minicom works well. -(3) Use Kdump (see Documentation/kdump/kdump.rst), +(3) Use Kdump (see Documentation/admin-guide/kdump/kdump.rst), extract the kernel ring buffer from old memory with using dmesg - gdbmacro in Documentation/kdump/gdbmacros.txt. + gdbmacro in Documentation/admin-guide/kdump/gdbmacros.txt. Finding the bug's location -------------------------- diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 6fcc83aaa9b6..5b63182ceb5f 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -39,6 +39,7 @@ problems and bugs in particular. ramoops dynamic-debug-howto init + kdump/index perf/index This is the beginning of a section with information of interest to diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt new file mode 100644 index 000000000000..220d0a80ca2c --- /dev/null +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -0,0 +1,264 @@ +# +# This file contains a few gdb macros (user defined commands) to extract +# useful information from kernel crashdump (kdump) like stack traces of +# all the processes or a particular process and trapinfo. +# +# These macros can be used by copying this file in .gdbinit (put in home +# directory or current directory) or by invoking gdb command with +# --command= option +# +# Credits: +# Alexander Nyberg +# V Srivatsa +# Maneesh Soni +# + +define bttnobp + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $stacksize = sizeof(union thread_union) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document bttnobp + dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER +end + +define btthreadstack + set var $pid_task = $arg0 + + printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm + printf "task struct: " + print $pid_task + printf "===================\n" + set var $stackp = $pid_task.thread.sp + set var $stacksize = sizeof(union thread_union) + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + set var $stack_bot = ($stackp & ~($stacksize - 1)) + + set $stackp = *((unsigned long *) $stackp) + while (($stackp < $stack_top) && ($stackp > $stack_bot)) + set var $addr = *(((unsigned long *) $stackp) + 1) + info symbol $addr + set $stackp = *((unsigned long *) $stackp) + end +end +document btthreadstack + dump a thread stack using the given task structure pointer +end + + +define btt + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + btthreadstack $next_t + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + btthreadstack $next_th + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document btt + dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER +end + +define btpid + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + btthreadstack $pid_task +end +document btpid + backtrace of pid +end + + +define trapinfo + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ + $pid_task.thread.cr2, $pid_task.thread.error_code + +end +document trapinfo + Run info threads and lookup pid of thread #1 + 'trapinfo ' will tell you by which trap & possibly + address the kernel panicked. +end + +define dump_log_idx + set $idx = $arg0 + if ($argc > 1) + set $prev_flags = $arg1 + else + set $prev_flags = 0 + end + set $msg = ((struct printk_log *) (log_buf + $idx)) + set $prefix = 1 + set $newline = 1 + set $log = log_buf + $idx + sizeof(*$msg) + + # prev & LOG_CONT && !(msg->flags & LOG_PREIX) + if (($prev_flags & 8) && !($msg->flags & 4)) + set $prefix = 0 + end + + # msg->flags & LOG_CONT + if ($msg->flags & 8) + # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) + if (($prev_flags & 8) && !($prev_flags & 2)) + set $prefix = 0 + end + # (!(msg->flags & LOG_NEWLINE)) + if (!($msg->flags & 2)) + set $newline = 0 + end + end + + if ($prefix) + printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 + end + if ($msg->text_len != 0) + eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len + end + if ($newline) + printf "\n" + end + if ($msg->dict_len > 0) + set $dict = $log + $msg->text_len + set $idx = 0 + set $line = 1 + while ($idx < $msg->dict_len) + if ($line) + printf " " + set $line = 0 + end + set $c = $dict[$idx] + if ($c == '\0') + printf "\n" + set $line = 1 + else + if ($c < ' ' || $c >= 127 || $c == '\\') + printf "\\x%02x", $c + else + printf "%c", $c + end + end + set $idx = $idx + 1 + end + printf "\n" + end +end +document dump_log_idx + Dump a single log given its index in the log buffer. The first + parameter is the index into log_buf, the second is optional and + specified the previous log buffer's flags, used for properly + formatting continued lines. +end + +define dmesg + set $i = log_first_idx + set $end_idx = log_first_idx + set $prev_flags = 0 + + while (1) + set $msg = ((struct printk_log *) (log_buf + $i)) + if ($msg->len == 0) + set $i = 0 + else + dump_log_idx $i $prev_flags + set $i = $i + $msg->len + set $prev_flags = $msg->flags + end + if ($i == $end_idx) + loop_break + end + end +end +document dmesg + print the kernel ring buffer +end diff --git a/Documentation/admin-guide/kdump/index.rst b/Documentation/admin-guide/kdump/index.rst new file mode 100644 index 000000000000..8e2ebd0383cd --- /dev/null +++ b/Documentation/admin-guide/kdump/index.rst @@ -0,0 +1,20 @@ + +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +.. toctree:: + :maxdepth: 1 + + kdump + vmcoreinfo + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst new file mode 100644 index 000000000000..ac7e131d2935 --- /dev/null +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -0,0 +1,534 @@ +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +Overview +======== + +Kdump uses kexec to quickly boot to a dump-capture kernel whenever a +dump of the system kernel's memory needs to be taken (for example, when +the system panics). The system kernel's memory image is preserved across +the reboot and is accessible to the dump-capture kernel. + +You can use common commands, such as cp and scp, to copy the +memory image to a dump file on the local disk, or across the network to +a remote system. + +Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, +s390x, arm and arm64 architectures. + +When the system kernel boots, it reserves a small section of memory for +the dump-capture kernel. This ensures that ongoing Direct Memory Access +(DMA) from the system kernel does not corrupt the dump-capture kernel. +The kexec -p command loads the dump-capture kernel into this reserved +memory. + +On x86 machines, the first 640 KB of physical memory is needed to boot, +regardless of where the kernel loads. Therefore, kexec backs up this +region just before rebooting into the dump-capture kernel. + +Similarly on PPC64 machines first 32KB of physical memory is needed for +booting regardless of where the kernel is loaded and to support 64K page +size kexec backs up the first 64KB memory. + +For s390x, when kdump is triggered, the crashkernel region is exchanged +with the region [0, crashkernel region size] and then the kdump kernel +runs in [0, crashkernel region size]. Therefore no relocatable kernel is +needed for s390x. + +All of the necessary information about the system kernel's core image is +encoded in the ELF format, and stored in a reserved area of memory +before a crash. The physical address of the start of the ELF header is +passed to the dump-capture kernel through the elfcorehdr= boot +parameter. Optionally the size of the ELF header can also be passed +when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. + + +With the dump-capture kernel, you can access the memory image through +/proc/vmcore. This exports the dump as an ELF-format file that you can +write out using file copy commands such as cp or scp. Further, you can +use analysis tools such as the GNU Debugger (GDB) and the Crash tool to +debug the dump file. This method ensures that the dump pages are correctly +ordered. + + +Setup and Installation +====================== + +Install kexec-tools +------------------- + +1) Login as the root user. + +2) Download the kexec-tools user-space package from the following URL: + +http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz + +This is a symlink to the latest version. + +The latest kexec-tools git tree is available at: + +- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git +- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git + +There is also a gitweb interface available at +http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git + +More information about kexec-tools can be found at +http://horms.net/projects/kexec/ + +3) Unpack the tarball with the tar command, as follows:: + + tar xvpzf kexec-tools.tar.gz + +4) Change to the kexec-tools directory, as follows:: + + cd kexec-tools-VERSION + +5) Configure the package, as follows:: + + ./configure + +6) Compile the package, as follows:: + + make + +7) Install the package, as follows:: + + make install + + +Build the system and dump-capture kernels +----------------------------------------- +There are two possible methods of using Kdump. + +1) Build a separate custom dump-capture kernel for capturing the + kernel core dump. + +2) Or use the system kernel binary itself as dump-capture kernel and there is + no need to build a separate dump-capture kernel. This is possible + only with the architectures which support a relocatable kernel. As + of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support + relocatable kernel. + +Building a relocatable kernel is advantageous from the point of view that +one does not have to build a second kernel for capturing the dump. But +at the same time one might want to build a custom dump capture kernel +suitable to his needs. + +Following are the configuration setting required for system and +dump-capture kernels for enabling kdump support. + +System kernel config options +---------------------------- + +1) Enable "kexec system call" in "Processor type and features.":: + + CONFIG_KEXEC=y + +2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo + filesystems." This is usually enabled by default:: + + CONFIG_SYSFS=y + + Note that "sysfs file system support" might not appear in the "Pseudo + filesystems" menu if "Configure standard kernel features (for small + systems)" is not enabled in "General Setup." In this case, check the + .config file itself to ensure that sysfs is turned on, as follows:: + + grep 'CONFIG_SYSFS' .config + +3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: + + CONFIG_DEBUG_INFO=Y + + This causes the kernel to be built with debug symbols. The dump + analysis tools require a vmlinux with debug symbols in order to read + and analyze a dump file. + +Dump-capture kernel config options (Arch Independent) +----------------------------------------------------- + +1) Enable "kernel crash dumps" support under "Processor type and + features":: + + CONFIG_CRASH_DUMP=y + +2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: + + CONFIG_PROC_VMCORE=y + + (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) + +Dump-capture kernel config options (Arch Dependent, i386 and x86_64) +-------------------------------------------------------------------- + +1) On i386, enable high memory support under "Processor type and + features":: + + CONFIG_HIGHMEM64G=y + + or:: + + CONFIG_HIGHMEM4G + +2) On i386 and x86_64, disable symmetric multi-processing support + under "Processor type and features":: + + CONFIG_SMP=n + + (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line + when loading the dump-capture kernel, see section "Load the Dump-capture + Kernel".) + +3) If one wants to build and use a relocatable kernel, + Enable "Build a relocatable kernel" support under "Processor type and + features":: + + CONFIG_RELOCATABLE=y + +4) Use a suitable value for "Physical address where the kernel is + loaded" (under "Processor type and features"). This only appears when + "kernel crash dumps" is enabled. A suitable value depends upon + whether kernel is relocatable or not. + + If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 + This will compile the kernel for physical address 1MB, but given the fact + kernel is relocatable, it can be run from any physical address hence + kexec boot loader will load it in memory region reserved for dump-capture + kernel. + + Otherwise it should be the start of memory region reserved for + second kernel using boot parameter "crashkernel=Y@X". Here X is + start of memory region reserved for dump-capture kernel. + Generally X is 16MB (0x1000000). So you can set + CONFIG_PHYSICAL_START=0x1000000 + +5) Make and install the kernel and its modules. DO NOT add this kernel + to the boot loader configuration files. + +Dump-capture kernel config options (Arch Dependent, ppc64) +---------------------------------------------------------- + +1) Enable "Build a kdump crash kernel" support under "Kernel" options:: + + CONFIG_CRASH_DUMP=y + +2) Enable "Build a relocatable kernel" support:: + + CONFIG_RELOCATABLE=y + + Make and install the kernel and its modules. + +Dump-capture kernel config options (Arch Dependent, ia64) +---------------------------------------------------------- + +- No specific options are required to create a dump-capture kernel + for ia64, other than those specified in the arch independent section + above. This means that it is possible to use the system kernel + as a dump-capture kernel if desired. + + The crashkernel region can be automatically placed by the system + kernel at run time. This is done by specifying the base address as 0, + or omitting it all together:: + + crashkernel=256M@0 + + or:: + + crashkernel=256M + + If the start address is specified, note that the start address of the + kernel will be aligned to 64Mb, so if the start address is not then + any space below the alignment point will be wasted. + +Dump-capture kernel config options (Arch Dependent, arm) +---------------------------------------------------------- + +- To use a relocatable kernel, + Enable "AUTO_ZRELADDR" support under "Boot" options:: + + AUTO_ZRELADDR=y + +Dump-capture kernel config options (Arch Dependent, arm64) +---------------------------------------------------------- + +- Please note that kvm of the dump-capture kernel will not be enabled + on non-VHE systems even if it is configured. This is because the CPU + will not be reset to EL2 on panic. + +Extended crashkernel syntax +=========================== + +While the "crashkernel=size[@offset]" syntax is sufficient for most +configurations, sometimes it's handy to have the reserved memory dependent +on the value of System RAM -- that's mostly for distributors that pre-setup +the kernel command line to avoid a unbootable system after some memory has +been removed from the machine. + +The syntax is:: + + crashkernel=:[,:,...][@offset] + range=start-[end] + +For example:: + + crashkernel=512M-2G:64M,2G-:128M + +This would mean: + + 1) if the RAM is smaller than 512M, then don't reserve anything + (this is the "rescue" case) + 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M + 3) if the RAM size is larger than 2G, then reserve 128M + + + +Boot into System Kernel +======================= + +1) Update the boot loader (such as grub, yaboot, or lilo) configuration + files as necessary. + +2) Boot the system kernel with the boot parameter "crashkernel=Y@X", + where Y specifies how much memory to reserve for the dump-capture kernel + and X specifies the beginning of this reserved memory. For example, + "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory + starting at physical address 0x01000000 (16MB) for the dump-capture kernel. + + On x86 and x86_64, use "crashkernel=64M@16M". + + On ppc64, use "crashkernel=128M@32M". + + On ia64, 256M@256M is a generous value that typically works. + The region may be automatically placed on ia64, see the + dump-capture kernel config option notes above. + If use sparse memory, the size should be rounded to GRANULE boundaries. + + On s390x, typically use "crashkernel=xxM". The value of xx is dependent + on the memory consumption of the kdump system. In general this is not + dependent on the memory size of the production system. + + On arm, the use of "crashkernel=Y@X" is no longer necessary; the + kernel will automatically locate the crash kernel image within the + first 512MB of RAM if X is not given. + + On arm64, use "crashkernel=Y[@X]". Note that the start address of + the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). + +Load the Dump-capture Kernel +============================ + +After booting to the system kernel, dump-capture kernel needs to be +loaded. + +Based on the architecture and type of image (relocatable or not), one +can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz +of dump-capture kernel. Following is the summary. + +For i386 and x86_64: + + - Use vmlinux if kernel is not relocatable. + - Use bzImage/vmlinuz if kernel is relocatable. + +For ppc64: + + - Use vmlinux + +For ia64: + + - Use vmlinux or vmlinuz.gz + +For s390x: + + - Use image or bzImage + +For arm: + + - Use zImage + +For arm64: + + - Use vmlinux or Image + +If you are using an uncompressed vmlinux image then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= --args-linux \ + --append="root= " + +If you are using a compressed bzImage/vmlinuz, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +If you are using a compressed zImage, then use following command +to load dump-capture kernel:: + + kexec --type zImage -p \ + --initrd= \ + --dtb= \ + --append="root= " + +If you are using an uncompressed Image, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +Please note, that --args-linux does not need to be specified for ia64. +It is planned to make this a no-op on that architecture, but for now +it should be omitted + +Following are the arch specific command line options to be used while +loading dump-capture kernel. + +For i386, x86_64 and ia64: + + "1 irqpoll maxcpus=1 reset_devices" + +For ppc64: + + "1 maxcpus=1 noirqdistrib reset_devices" + +For s390x: + + "1 maxcpus=1 cgroup_disable=memory" + +For arm: + + "1 maxcpus=1 reset_devices" + +For arm64: + + "1 maxcpus=1 reset_devices" + +Notes on loading the dump-capture kernel: + +* By default, the ELF headers are stored in ELF64 format to support + systems with more than 4GB memory. On i386, kexec automatically checks if + the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. + So, on non-PAE systems, ELF32 is always used. + + The --elf32-core-headers option can be used to force the generation of ELF32 + headers. This is necessary because GDB currently cannot open vmcore files + with ELF64 headers on 32-bit systems. + +* The "irqpoll" boot parameter reduces driver initialization failures + due to shared interrupts in the dump-capture kernel. + +* You must specify in the format corresponding to the root + device name in the output of mount command. + +* Boot parameter "1" boots the dump-capture kernel into single-user + mode without networking. If you want networking, use "3". + +* We generally don't have to bring up a SMP kernel just to capture the + dump. Hence generally it is useful either to build a UP dump-capture + kernel or specify maxcpus=1 option while loading dump-capture kernel. + Note, though maxcpus always works, you had better replace it with + nr_cpus to save memory if supported by the current ARCH, such as x86. + +* You should enable multi-cpu support in dump-capture kernel if you intend + to use multi-thread programs with it, such as parallel dump feature of + makedumpfile. Otherwise, the multi-thread program may have a great + performance degradation. To enable multi-cpu support, you should bring up an + SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] + options while loading it. + +* For s390x there are two kdump modes: If a ELF header is specified with + the elfcorehdr= kernel parameter, it is used by the kdump kernel as it + is done on all other architectures. If no elfcorehdr= kernel parameter is + specified, the s390x kdump kernel dynamically creates the header. The + second mode has the advantage that for CPU and memory hotplug, kdump has + not to be reloaded with kexec_load(). + +* For s390x systems with many attached devices the "cio_ignore" kernel + parameter should be used for the kdump kernel in order to prevent allocation + of kernel memory for devices that are not relevant for kdump. The same + applies to systems that use SCSI/FCP devices. In that case the + "allow_lun_scan" zfcp module parameter should be set to zero before + setting FCP devices online. + +Kernel Panic +============ + +After successfully loading the dump-capture kernel as previously +described, the system will reboot into the dump-capture kernel if a +system crash is triggered. Trigger points are located in panic(), +die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). + +The following conditions will execute a crash trigger point: + +If a hard lockup is detected and "NMI watchdog" is configured, the system +will boot into the dump-capture kernel ( die_nmi() ). + +If die() is called, and it happens to be a thread with pid 0 or 1, or die() +is called inside interrupt context or die() is called and panic_on_oops is set, +the system will boot into the dump-capture kernel. + +On powerpc systems when a soft-reset is generated, die() is called by all cpus +and the system will boot into the dump-capture kernel. + +For testing purposes, you can trigger a crash by using "ALT-SysRq-c", +"echo c > /proc/sysrq-trigger" or write a module to force the panic. + +Write Out the Dump File +======================= + +After the dump-capture kernel is booted, write out the dump file with +the following command:: + + cp /proc/vmcore + + +Analysis +======== + +Before analyzing the dump image, you should reboot into a stable kernel. + +You can do limited analysis using GDB on the dump file copied out of +/proc/vmcore. Use the debug vmlinux built with -g and run the following +command:: + + gdb vmlinux + +Stack trace for the task on processor 0, register display, and memory +display work fine. + +Note: GDB cannot analyze core files generated in ELF64 format for x86. +On systems with a maximum of 4GB of memory, you can generate +ELF32-format headers using the --elf32-core-headers kernel option on the +dump kernel. + +You can also use the Crash utility to analyze dump files in Kdump +format. Crash is available on Dave Anderson's site at the following URL: + + http://people.redhat.com/~anderson/ + +Trigger Kdump on WARN() +======================= + +The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This +will cause a kdump to occur at the panic() call. In cases where a user wants +to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 +to achieve the same behaviour. + +Contact +======= + +- Vivek Goyal (vgoyal@redhat.com) +- Maneesh Soni (maneesh@in.ibm.com) + +GDB macros +========== + +.. include:: gdbmacros.txt + :literal: diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst new file mode 100644 index 000000000000..007a6b86e0ee --- /dev/null +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -0,0 +1,488 @@ +========== +VMCOREINFO +========== + +What is it? +=========== + +VMCOREINFO is a special ELF note section. It contains various +information from the kernel like structure size, page size, symbol +values, field offsets, etc. These data are packed into an ELF note +section and used by user-space tools like crash and makedumpfile to +analyze a kernel's memory layout. + +Common variables +================ + +init_uts_ns.name.release +------------------------ + +The version of the Linux kernel. Used to find the corresponding source +code from which the kernel has been built. For example, crash uses it to +find the corresponding vmlinux in order to process vmcore. + +PAGE_SIZE +--------- + +The size of a page. It is the smallest unit of data used by the memory +management facilities. It is usually 4096 bytes of size and a page is +aligned on 4096 bytes. Used for computing page addresses. + +init_uts_ns +----------- + +The UTS namespace which is used to isolate two specific elements of the +system that relate to the uname(2) system call. It is named after the +data structure used to store information returned by the uname(2) system +call. + +User-space tools can get the kernel name, host name, kernel release +number, kernel version, architecture name and OS type from it. + +node_online_map +--------------- + +An array node_states[N_ONLINE] which represents the set of online nodes +in a system, one bit position per node number. Used to keep track of +which nodes are in the system and online. + +swapper_pg_dir +-------------- + +The global page directory pointer of the kernel. Used to translate +virtual to physical addresses. + +_stext +------ + +Defines the beginning of the text section. In general, _stext indicates +the kernel start address. Used to convert a virtual address from the +direct kernel map to a physical address. + +vmap_area_list +-------------- + +Stores the virtual area list. makedumpfile gets the vmalloc start value +from this variable and its value is necessary for vmalloc translation. + +mem_map +------- + +Physical addresses are translated to struct pages by treating them as +an index into the mem_map array. Right-shifting a physical address +PAGE_SHIFT bits converts it into a page frame number which is an index +into that mem_map array. + +Used to map an address to the corresponding struct page. + +contig_page_data +---------------- + +Makedumpfile gets the pglist_data structure from this symbol, which is +used to describe the memory layout. + +User-space tools use this to exclude free pages when dumping memory. + +mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) +-------------------------------------------------------------------------- + +The address of the mem_section array, its length, structure size, and +the section_mem_map offset. + +It exists in the sparse memory mapping model, and it is also somewhat +similar to the mem_map variable, both of them are used to translate an +address. + +page +---- + +The size of a page structure. struct page is an important data structure +and it is widely used to compute contiguous memory. + +pglist_data +----------- + +The size of a pglist_data structure. This value is used to check if the +pglist_data structure is valid. It is also used for checking the memory +type. + +zone +---- + +The size of a zone structure. This value is used to check if the zone +structure has been found. It is also used for excluding free pages. + +free_area +--------- + +The size of a free_area structure. It indicates whether the free_area +structure is valid or not. Useful when excluding free pages. + +list_head +--------- + +The size of a list_head structure. Used when iterating lists in a +post-mortem analysis session. + +nodemask_t +---------- + +The size of a nodemask_t type. Used to compute the number of online +nodes. + +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) +------------------------------------------------------------------------------------------------- + +User-space tools compute their values based on the offset of these +variables. The variables are used when excluding unnecessary pages. + +(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) +----------------------------------------------------------------------------------------- + +On NUMA machines, each NUMA node has a pg_data_t to describe its memory +layout. On UMA machines there is a single pglist_data which describes the +whole memory. + +These values are used to check the memory type and to compute the +virtual address for memory map. + +(zone, free_area|vm_stat|spanned_pages) +--------------------------------------- + +Each node is divided into a number of blocks called zones which +represent ranges within memory. A zone is described by a structure zone. + +User-space tools compute required values based on the offset of these +variables. + +(free_area, free_list) +---------------------- + +Offset of the free_list's member. This value is used to compute the number +of free pages. + +Each zone has a free_area structure array called free_area[MAX_ORDER]. +The free_list represents a linked list of free page blocks. + +(list_head, next|prev) +---------------------- + +Offsets of the list_head's members. list_head is used to define a +circular linked list. User-space tools need these in order to traverse +lists. + +(vmap_area, va_start|list) +-------------------------- + +Offsets of the vmap_area's members. They carry vmalloc-specific +information. Makedumpfile gets the start address of the vmalloc region +from this. + +(zone.free_area, MAX_ORDER) +--------------------------- + +Free areas descriptor. User-space tools use this value to iterate the +free_area ranges. MAX_ORDER is used by the zone buddy allocator. + +log_first_idx +------------- + +Index of the first record stored in the buffer log_buf. Used by +user-space tools to read the strings in the log_buf. + +log_buf +------- + +Console output is written to the ring buffer log_buf at index +log_first_idx. Used to get the kernel log. + +log_buf_len +----------- + +log_buf's length. + +clear_idx +--------- + +The index that the next printk() record to read after the last clear +command. It indicates the first record after the last SYSLOG_ACTION +_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump +the dmesg log. + +log_next_idx +------------ + +The index of the next record to store in the buffer log_buf. Used to +compute the index of the current buffer position. + +printk_log +---------- + +The size of a structure printk_log. Used to compute the size of +messages, and extract dmesg log. It encapsulates header information for +log_buf, such as timestamp, syslog level, etc. + +(printk_log, ts_nsec|len|text_len|dict_len) +------------------------------------------- + +It represents field offsets in struct printk_log. User space tools +parse it and check whether the values of printk_log's members have been +changed. + +(free_area.free_list, MIGRATE_TYPES) +------------------------------------ + +The number of migrate types for pages. The free_list is described by the +array. Used by tools to compute the number of free pages. + +NR_FREE_PAGES +------------- + +On linux-2.6.21 or later, the number of free pages is in +vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. + +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask +------------------------------------------------------------------------------ + +Page attributes. These flags are used to filter various unnecessary for +dumping pages. + +PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) +----------------------------------------------------------------------------- + +More page attributes. These flags are used to filter various unnecessary for +dumping pages. + + +HUGETLB_PAGE_DTOR +----------------- + +The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile +excludes these pages. + +x86_64 +====== + +phys_base +--------- + +Used to convert the virtual address of an exported kernel symbol to its +corresponding physical address. + +init_top_pgt +------------ + +Used to walk through the whole page table and convert virtual addresses +to physical addresses. The init_top_pgt is somewhat similar to +swapper_pg_dir, but it is only used in x86_64. + +pgtable_l5_enabled +------------------ + +User-space tools need to know whether the crash kernel was in 5-level +paging mode. + +node_data +--------- + +This is a struct pglist_data array and stores all NUMA nodes +information. Makedumpfile gets the pglist_data structure from it. + +(node_data, MAX_NUMNODES) +------------------------- + +The maximum number of nodes in system. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +KERNEL_IMAGE_SIZE +----------------- + +Currently unused by Makedumpfile. Used to compute the module virtual +address by Crash. + +sme_mask +-------- + +AMD-specific with SME support: it indicates the secure memory encryption +mask. Makedumpfile tools need to know whether the crash kernel was +encrypted. If SME is enabled in the first kernel, the crash kernel's +page table entries (pgd/pud/pmd/pte) contain the memory encryption +mask. This is used to remove the SME mask and obtain the true physical +address. + +Currently, sme_mask stores the value of the C-bit position. If needed, +additional SME-relevant info can be placed in that variable. + +For example:: + + [ misc ][ enc bit ][ other misc SME info ] + 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 + 63 59 55 51 47 43 39 35 31 27 ... 3 + +x86_32 +====== + +X86_PAE +------- + +Denotes whether physical address extensions are enabled. It has the cost +of a higher page table lookup overhead, and also consumes more page +table space per process. Used to check whether PAE was enabled in the +crash kernel when converting virtual addresses to physical addresses. + +ia64 +==== + +pgdat_list|(pgdat_list, MAX_NUMNODES) +------------------------------------- + +pg_data_t array storing all NUMA nodes information. MAX_NUMNODES +indicates the number of the nodes. + +node_memblk|(node_memblk, NR_NODE_MEMBLKS) +------------------------------------------ + +List of node memory chunks. Filled when parsing the SRAT table to obtain +information about memory nodes. NR_NODE_MEMBLKS indicates the number of +node memory chunks. + +These values are used to compute the number of nodes the crashed kernel used. + +node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) +---------------------------------------------------------------- + +The size of a struct node_memblk_s and the offsets of the +node_memblk_s's members. Used to compute the number of nodes. + +PGTABLE_3|PGTABLE_4 +------------------- + +User-space tools need to know whether the crash kernel was in 3-level or +4-level paging mode. Used to distinguish the page table. + +ARM64 +===== + +VA_BITS +------- + +The maximum number of bits for virtual addresses. Used to compute the +virtual memory ranges. + +kimage_voffset +-------------- + +The offset between the kernel virtual and physical mappings. Used to +translate virtual to physical addresses. + +PHYS_OFFSET +----------- + +Indicates the physical address of the start of memory. Similar to +kimage_voffset, which is used to translate virtual to physical +addresses. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +arm +=== + +ARM_LPAE +-------- + +It indicates whether the crash kernel supports large physical address +extensions. Used to translate virtual to physical addresses. + +s390 +==== + +lowcore_ptr +----------- + +An array with a pointer to the lowcore of every CPU. Used to print the +psw and all registers information. + +high_memory +----------- + +Used to get the vmalloc_start address from the high_memory symbol. + +(lowcore_ptr, NR_CPUS) +---------------------- + +The maximum number of CPUs. + +powerpc +======= + + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +contig_page_data +---------------- + +See above. + +vmemmap_list +------------ + +The vmemmap_list maintains the entire vmemmap physical mapping. Used +to get vmemmap list count and populated vmemmap regions info. If the +vmemmap address translation information is stored in the crash kernel, +it is used to translate vmemmap kernel virtual addresses. + +mmu_vmemmap_psize +----------------- + +The size of a page. Used to translate virtual to physical addresses. + +mmu_psize_defs +-------------- + +Page size definitions, i.e. 4k, 64k, or 16M. + +Used to make vtop translations. + +vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) +-------------------------------------------------------------------------------------------- + +The vmemmap virtual address space management does not have a traditional +page table to track which virtual struct pages are backed by a physical +mapping. The virtual to physical mappings are tracked in a simple linked +list format. + +User-space tools need to know the offset of list, phys and virt_addr +when computing the count of vmemmap regions. + +mmu_psize_def|(mmu_psize_def, shift) +------------------------------------ + +The size of a struct mmu_psize_def and the offset of mmu_psize_def's +member. + +Used in vtop translations. + +sh +== + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +X2TLB +----- + +Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4821175a3769..e645b3ab4b6f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -708,14 +708,14 @@ [KNL, x86_64] select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. - See Documentation/kdump/kdump.rst for further details. + See Documentation/admin-guide/kdump/kdump.rst for further details. crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is start-[end] where start and end are both a memory unit (amount[KMG]). See also - Documentation/kdump/kdump.rst for an example. + Documentation/admin-guide/kdump/kdump.rst for an example. crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel @@ -1207,7 +1207,7 @@ Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. - See Documentation/kdump/kdump.rst for details. + See Documentation/admin-guide/kdump/kdump.rst for details. enable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/kdump/gdbmacros.txt deleted file mode 100644 index 220d0a80ca2c..000000000000 --- a/Documentation/kdump/gdbmacros.txt +++ /dev/null @@ -1,264 +0,0 @@ -# -# This file contains a few gdb macros (user defined commands) to extract -# useful information from kernel crashdump (kdump) like stack traces of -# all the processes or a particular process and trapinfo. -# -# These macros can be used by copying this file in .gdbinit (put in home -# directory or current directory) or by invoking gdb command with -# --command= option -# -# Credits: -# Alexander Nyberg -# V Srivatsa -# Maneesh Soni -# - -define bttnobp - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $stacksize = sizeof(union thread_union) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document bttnobp - dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER -end - -define btthreadstack - set var $pid_task = $arg0 - - printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm - printf "task struct: " - print $pid_task - printf "===================\n" - set var $stackp = $pid_task.thread.sp - set var $stacksize = sizeof(union thread_union) - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - set var $stack_bot = ($stackp & ~($stacksize - 1)) - - set $stackp = *((unsigned long *) $stackp) - while (($stackp < $stack_top) && ($stackp > $stack_bot)) - set var $addr = *(((unsigned long *) $stackp) + 1) - info symbol $addr - set $stackp = *((unsigned long *) $stackp) - end -end -document btthreadstack - dump a thread stack using the given task structure pointer -end - - -define btt - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - btthreadstack $next_t - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - btthreadstack $next_th - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document btt - dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER -end - -define btpid - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - btthreadstack $pid_task -end -document btpid - backtrace of pid -end - - -define trapinfo - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ - $pid_task.thread.cr2, $pid_task.thread.error_code - -end -document trapinfo - Run info threads and lookup pid of thread #1 - 'trapinfo ' will tell you by which trap & possibly - address the kernel panicked. -end - -define dump_log_idx - set $idx = $arg0 - if ($argc > 1) - set $prev_flags = $arg1 - else - set $prev_flags = 0 - end - set $msg = ((struct printk_log *) (log_buf + $idx)) - set $prefix = 1 - set $newline = 1 - set $log = log_buf + $idx + sizeof(*$msg) - - # prev & LOG_CONT && !(msg->flags & LOG_PREIX) - if (($prev_flags & 8) && !($msg->flags & 4)) - set $prefix = 0 - end - - # msg->flags & LOG_CONT - if ($msg->flags & 8) - # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) - if (($prev_flags & 8) && !($prev_flags & 2)) - set $prefix = 0 - end - # (!(msg->flags & LOG_NEWLINE)) - if (!($msg->flags & 2)) - set $newline = 0 - end - end - - if ($prefix) - printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 - end - if ($msg->text_len != 0) - eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len - end - if ($newline) - printf "\n" - end - if ($msg->dict_len > 0) - set $dict = $log + $msg->text_len - set $idx = 0 - set $line = 1 - while ($idx < $msg->dict_len) - if ($line) - printf " " - set $line = 0 - end - set $c = $dict[$idx] - if ($c == '\0') - printf "\n" - set $line = 1 - else - if ($c < ' ' || $c >= 127 || $c == '\\') - printf "\\x%02x", $c - else - printf "%c", $c - end - end - set $idx = $idx + 1 - end - printf "\n" - end -end -document dump_log_idx - Dump a single log given its index in the log buffer. The first - parameter is the index into log_buf, the second is optional and - specified the previous log buffer's flags, used for properly - formatting continued lines. -end - -define dmesg - set $i = log_first_idx - set $end_idx = log_first_idx - set $prev_flags = 0 - - while (1) - set $msg = ((struct printk_log *) (log_buf + $i)) - if ($msg->len == 0) - set $i = 0 - else - dump_log_idx $i $prev_flags - set $i = $i + $msg->len - set $prev_flags = $msg->flags - end - if ($i == $end_idx) - loop_break - end - end -end -document dmesg - print the kernel ring buffer -end diff --git a/Documentation/kdump/index.rst b/Documentation/kdump/index.rst deleted file mode 100644 index 2b17fcf6867a..000000000000 --- a/Documentation/kdump/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -:orphan: - -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -.. toctree:: - :maxdepth: 1 - - kdump - vmcoreinfo - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/kdump/kdump.rst b/Documentation/kdump/kdump.rst deleted file mode 100644 index ac7e131d2935..000000000000 --- a/Documentation/kdump/kdump.rst +++ /dev/null @@ -1,534 +0,0 @@ -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -Overview -======== - -Kdump uses kexec to quickly boot to a dump-capture kernel whenever a -dump of the system kernel's memory needs to be taken (for example, when -the system panics). The system kernel's memory image is preserved across -the reboot and is accessible to the dump-capture kernel. - -You can use common commands, such as cp and scp, to copy the -memory image to a dump file on the local disk, or across the network to -a remote system. - -Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, -s390x, arm and arm64 architectures. - -When the system kernel boots, it reserves a small section of memory for -the dump-capture kernel. This ensures that ongoing Direct Memory Access -(DMA) from the system kernel does not corrupt the dump-capture kernel. -The kexec -p command loads the dump-capture kernel into this reserved -memory. - -On x86 machines, the first 640 KB of physical memory is needed to boot, -regardless of where the kernel loads. Therefore, kexec backs up this -region just before rebooting into the dump-capture kernel. - -Similarly on PPC64 machines first 32KB of physical memory is needed for -booting regardless of where the kernel is loaded and to support 64K page -size kexec backs up the first 64KB memory. - -For s390x, when kdump is triggered, the crashkernel region is exchanged -with the region [0, crashkernel region size] and then the kdump kernel -runs in [0, crashkernel region size]. Therefore no relocatable kernel is -needed for s390x. - -All of the necessary information about the system kernel's core image is -encoded in the ELF format, and stored in a reserved area of memory -before a crash. The physical address of the start of the ELF header is -passed to the dump-capture kernel through the elfcorehdr= boot -parameter. Optionally the size of the ELF header can also be passed -when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. - - -With the dump-capture kernel, you can access the memory image through -/proc/vmcore. This exports the dump as an ELF-format file that you can -write out using file copy commands such as cp or scp. Further, you can -use analysis tools such as the GNU Debugger (GDB) and the Crash tool to -debug the dump file. This method ensures that the dump pages are correctly -ordered. - - -Setup and Installation -====================== - -Install kexec-tools -------------------- - -1) Login as the root user. - -2) Download the kexec-tools user-space package from the following URL: - -http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz - -This is a symlink to the latest version. - -The latest kexec-tools git tree is available at: - -- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git -- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git - -There is also a gitweb interface available at -http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git - -More information about kexec-tools can be found at -http://horms.net/projects/kexec/ - -3) Unpack the tarball with the tar command, as follows:: - - tar xvpzf kexec-tools.tar.gz - -4) Change to the kexec-tools directory, as follows:: - - cd kexec-tools-VERSION - -5) Configure the package, as follows:: - - ./configure - -6) Compile the package, as follows:: - - make - -7) Install the package, as follows:: - - make install - - -Build the system and dump-capture kernels ------------------------------------------ -There are two possible methods of using Kdump. - -1) Build a separate custom dump-capture kernel for capturing the - kernel core dump. - -2) Or use the system kernel binary itself as dump-capture kernel and there is - no need to build a separate dump-capture kernel. This is possible - only with the architectures which support a relocatable kernel. As - of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support - relocatable kernel. - -Building a relocatable kernel is advantageous from the point of view that -one does not have to build a second kernel for capturing the dump. But -at the same time one might want to build a custom dump capture kernel -suitable to his needs. - -Following are the configuration setting required for system and -dump-capture kernels for enabling kdump support. - -System kernel config options ----------------------------- - -1) Enable "kexec system call" in "Processor type and features.":: - - CONFIG_KEXEC=y - -2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo - filesystems." This is usually enabled by default:: - - CONFIG_SYSFS=y - - Note that "sysfs file system support" might not appear in the "Pseudo - filesystems" menu if "Configure standard kernel features (for small - systems)" is not enabled in "General Setup." In this case, check the - .config file itself to ensure that sysfs is turned on, as follows:: - - grep 'CONFIG_SYSFS' .config - -3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: - - CONFIG_DEBUG_INFO=Y - - This causes the kernel to be built with debug symbols. The dump - analysis tools require a vmlinux with debug symbols in order to read - and analyze a dump file. - -Dump-capture kernel config options (Arch Independent) ------------------------------------------------------ - -1) Enable "kernel crash dumps" support under "Processor type and - features":: - - CONFIG_CRASH_DUMP=y - -2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: - - CONFIG_PROC_VMCORE=y - - (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) - -Dump-capture kernel config options (Arch Dependent, i386 and x86_64) --------------------------------------------------------------------- - -1) On i386, enable high memory support under "Processor type and - features":: - - CONFIG_HIGHMEM64G=y - - or:: - - CONFIG_HIGHMEM4G - -2) On i386 and x86_64, disable symmetric multi-processing support - under "Processor type and features":: - - CONFIG_SMP=n - - (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line - when loading the dump-capture kernel, see section "Load the Dump-capture - Kernel".) - -3) If one wants to build and use a relocatable kernel, - Enable "Build a relocatable kernel" support under "Processor type and - features":: - - CONFIG_RELOCATABLE=y - -4) Use a suitable value for "Physical address where the kernel is - loaded" (under "Processor type and features"). This only appears when - "kernel crash dumps" is enabled. A suitable value depends upon - whether kernel is relocatable or not. - - If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 - This will compile the kernel for physical address 1MB, but given the fact - kernel is relocatable, it can be run from any physical address hence - kexec boot loader will load it in memory region reserved for dump-capture - kernel. - - Otherwise it should be the start of memory region reserved for - second kernel using boot parameter "crashkernel=Y@X". Here X is - start of memory region reserved for dump-capture kernel. - Generally X is 16MB (0x1000000). So you can set - CONFIG_PHYSICAL_START=0x1000000 - -5) Make and install the kernel and its modules. DO NOT add this kernel - to the boot loader configuration files. - -Dump-capture kernel config options (Arch Dependent, ppc64) ----------------------------------------------------------- - -1) Enable "Build a kdump crash kernel" support under "Kernel" options:: - - CONFIG_CRASH_DUMP=y - -2) Enable "Build a relocatable kernel" support:: - - CONFIG_RELOCATABLE=y - - Make and install the kernel and its modules. - -Dump-capture kernel config options (Arch Dependent, ia64) ----------------------------------------------------------- - -- No specific options are required to create a dump-capture kernel - for ia64, other than those specified in the arch independent section - above. This means that it is possible to use the system kernel - as a dump-capture kernel if desired. - - The crashkernel region can be automatically placed by the system - kernel at run time. This is done by specifying the base address as 0, - or omitting it all together:: - - crashkernel=256M@0 - - or:: - - crashkernel=256M - - If the start address is specified, note that the start address of the - kernel will be aligned to 64Mb, so if the start address is not then - any space below the alignment point will be wasted. - -Dump-capture kernel config options (Arch Dependent, arm) ----------------------------------------------------------- - -- To use a relocatable kernel, - Enable "AUTO_ZRELADDR" support under "Boot" options:: - - AUTO_ZRELADDR=y - -Dump-capture kernel config options (Arch Dependent, arm64) ----------------------------------------------------------- - -- Please note that kvm of the dump-capture kernel will not be enabled - on non-VHE systems even if it is configured. This is because the CPU - will not be reset to EL2 on panic. - -Extended crashkernel syntax -=========================== - -While the "crashkernel=size[@offset]" syntax is sufficient for most -configurations, sometimes it's handy to have the reserved memory dependent -on the value of System RAM -- that's mostly for distributors that pre-setup -the kernel command line to avoid a unbootable system after some memory has -been removed from the machine. - -The syntax is:: - - crashkernel=:[,:,...][@offset] - range=start-[end] - -For example:: - - crashkernel=512M-2G:64M,2G-:128M - -This would mean: - - 1) if the RAM is smaller than 512M, then don't reserve anything - (this is the "rescue" case) - 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M - 3) if the RAM size is larger than 2G, then reserve 128M - - - -Boot into System Kernel -======================= - -1) Update the boot loader (such as grub, yaboot, or lilo) configuration - files as necessary. - -2) Boot the system kernel with the boot parameter "crashkernel=Y@X", - where Y specifies how much memory to reserve for the dump-capture kernel - and X specifies the beginning of this reserved memory. For example, - "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory - starting at physical address 0x01000000 (16MB) for the dump-capture kernel. - - On x86 and x86_64, use "crashkernel=64M@16M". - - On ppc64, use "crashkernel=128M@32M". - - On ia64, 256M@256M is a generous value that typically works. - The region may be automatically placed on ia64, see the - dump-capture kernel config option notes above. - If use sparse memory, the size should be rounded to GRANULE boundaries. - - On s390x, typically use "crashkernel=xxM". The value of xx is dependent - on the memory consumption of the kdump system. In general this is not - dependent on the memory size of the production system. - - On arm, the use of "crashkernel=Y@X" is no longer necessary; the - kernel will automatically locate the crash kernel image within the - first 512MB of RAM if X is not given. - - On arm64, use "crashkernel=Y[@X]". Note that the start address of - the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). - -Load the Dump-capture Kernel -============================ - -After booting to the system kernel, dump-capture kernel needs to be -loaded. - -Based on the architecture and type of image (relocatable or not), one -can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz -of dump-capture kernel. Following is the summary. - -For i386 and x86_64: - - - Use vmlinux if kernel is not relocatable. - - Use bzImage/vmlinuz if kernel is relocatable. - -For ppc64: - - - Use vmlinux - -For ia64: - - - Use vmlinux or vmlinuz.gz - -For s390x: - - - Use image or bzImage - -For arm: - - - Use zImage - -For arm64: - - - Use vmlinux or Image - -If you are using an uncompressed vmlinux image then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= --args-linux \ - --append="root= " - -If you are using a compressed bzImage/vmlinuz, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -If you are using a compressed zImage, then use following command -to load dump-capture kernel:: - - kexec --type zImage -p \ - --initrd= \ - --dtb= \ - --append="root= " - -If you are using an uncompressed Image, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -Please note, that --args-linux does not need to be specified for ia64. -It is planned to make this a no-op on that architecture, but for now -it should be omitted - -Following are the arch specific command line options to be used while -loading dump-capture kernel. - -For i386, x86_64 and ia64: - - "1 irqpoll maxcpus=1 reset_devices" - -For ppc64: - - "1 maxcpus=1 noirqdistrib reset_devices" - -For s390x: - - "1 maxcpus=1 cgroup_disable=memory" - -For arm: - - "1 maxcpus=1 reset_devices" - -For arm64: - - "1 maxcpus=1 reset_devices" - -Notes on loading the dump-capture kernel: - -* By default, the ELF headers are stored in ELF64 format to support - systems with more than 4GB memory. On i386, kexec automatically checks if - the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. - So, on non-PAE systems, ELF32 is always used. - - The --elf32-core-headers option can be used to force the generation of ELF32 - headers. This is necessary because GDB currently cannot open vmcore files - with ELF64 headers on 32-bit systems. - -* The "irqpoll" boot parameter reduces driver initialization failures - due to shared interrupts in the dump-capture kernel. - -* You must specify in the format corresponding to the root - device name in the output of mount command. - -* Boot parameter "1" boots the dump-capture kernel into single-user - mode without networking. If you want networking, use "3". - -* We generally don't have to bring up a SMP kernel just to capture the - dump. Hence generally it is useful either to build a UP dump-capture - kernel or specify maxcpus=1 option while loading dump-capture kernel. - Note, though maxcpus always works, you had better replace it with - nr_cpus to save memory if supported by the current ARCH, such as x86. - -* You should enable multi-cpu support in dump-capture kernel if you intend - to use multi-thread programs with it, such as parallel dump feature of - makedumpfile. Otherwise, the multi-thread program may have a great - performance degradation. To enable multi-cpu support, you should bring up an - SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] - options while loading it. - -* For s390x there are two kdump modes: If a ELF header is specified with - the elfcorehdr= kernel parameter, it is used by the kdump kernel as it - is done on all other architectures. If no elfcorehdr= kernel parameter is - specified, the s390x kdump kernel dynamically creates the header. The - second mode has the advantage that for CPU and memory hotplug, kdump has - not to be reloaded with kexec_load(). - -* For s390x systems with many attached devices the "cio_ignore" kernel - parameter should be used for the kdump kernel in order to prevent allocation - of kernel memory for devices that are not relevant for kdump. The same - applies to systems that use SCSI/FCP devices. In that case the - "allow_lun_scan" zfcp module parameter should be set to zero before - setting FCP devices online. - -Kernel Panic -============ - -After successfully loading the dump-capture kernel as previously -described, the system will reboot into the dump-capture kernel if a -system crash is triggered. Trigger points are located in panic(), -die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). - -The following conditions will execute a crash trigger point: - -If a hard lockup is detected and "NMI watchdog" is configured, the system -will boot into the dump-capture kernel ( die_nmi() ). - -If die() is called, and it happens to be a thread with pid 0 or 1, or die() -is called inside interrupt context or die() is called and panic_on_oops is set, -the system will boot into the dump-capture kernel. - -On powerpc systems when a soft-reset is generated, die() is called by all cpus -and the system will boot into the dump-capture kernel. - -For testing purposes, you can trigger a crash by using "ALT-SysRq-c", -"echo c > /proc/sysrq-trigger" or write a module to force the panic. - -Write Out the Dump File -======================= - -After the dump-capture kernel is booted, write out the dump file with -the following command:: - - cp /proc/vmcore - - -Analysis -======== - -Before analyzing the dump image, you should reboot into a stable kernel. - -You can do limited analysis using GDB on the dump file copied out of -/proc/vmcore. Use the debug vmlinux built with -g and run the following -command:: - - gdb vmlinux - -Stack trace for the task on processor 0, register display, and memory -display work fine. - -Note: GDB cannot analyze core files generated in ELF64 format for x86. -On systems with a maximum of 4GB of memory, you can generate -ELF32-format headers using the --elf32-core-headers kernel option on the -dump kernel. - -You can also use the Crash utility to analyze dump files in Kdump -format. Crash is available on Dave Anderson's site at the following URL: - - http://people.redhat.com/~anderson/ - -Trigger Kdump on WARN() -======================= - -The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This -will cause a kdump to occur at the panic() call. In cases where a user wants -to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 -to achieve the same behaviour. - -Contact -======= - -- Vivek Goyal (vgoyal@redhat.com) -- Maneesh Soni (maneesh@in.ibm.com) - -GDB macros -========== - -.. include:: gdbmacros.txt - :literal: diff --git a/Documentation/kdump/vmcoreinfo.rst b/Documentation/kdump/vmcoreinfo.rst deleted file mode 100644 index 007a6b86e0ee..000000000000 --- a/Documentation/kdump/vmcoreinfo.rst +++ /dev/null @@ -1,488 +0,0 @@ -========== -VMCOREINFO -========== - -What is it? -=========== - -VMCOREINFO is a special ELF note section. It contains various -information from the kernel like structure size, page size, symbol -values, field offsets, etc. These data are packed into an ELF note -section and used by user-space tools like crash and makedumpfile to -analyze a kernel's memory layout. - -Common variables -================ - -init_uts_ns.name.release ------------------------- - -The version of the Linux kernel. Used to find the corresponding source -code from which the kernel has been built. For example, crash uses it to -find the corresponding vmlinux in order to process vmcore. - -PAGE_SIZE ---------- - -The size of a page. It is the smallest unit of data used by the memory -management facilities. It is usually 4096 bytes of size and a page is -aligned on 4096 bytes. Used for computing page addresses. - -init_uts_ns ------------ - -The UTS namespace which is used to isolate two specific elements of the -system that relate to the uname(2) system call. It is named after the -data structure used to store information returned by the uname(2) system -call. - -User-space tools can get the kernel name, host name, kernel release -number, kernel version, architecture name and OS type from it. - -node_online_map ---------------- - -An array node_states[N_ONLINE] which represents the set of online nodes -in a system, one bit position per node number. Used to keep track of -which nodes are in the system and online. - -swapper_pg_dir --------------- - -The global page directory pointer of the kernel. Used to translate -virtual to physical addresses. - -_stext ------- - -Defines the beginning of the text section. In general, _stext indicates -the kernel start address. Used to convert a virtual address from the -direct kernel map to a physical address. - -vmap_area_list --------------- - -Stores the virtual area list. makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. - -mem_map -------- - -Physical addresses are translated to struct pages by treating them as -an index into the mem_map array. Right-shifting a physical address -PAGE_SHIFT bits converts it into a page frame number which is an index -into that mem_map array. - -Used to map an address to the corresponding struct page. - -contig_page_data ----------------- - -Makedumpfile gets the pglist_data structure from this symbol, which is -used to describe the memory layout. - -User-space tools use this to exclude free pages when dumping memory. - -mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) --------------------------------------------------------------------------- - -The address of the mem_section array, its length, structure size, and -the section_mem_map offset. - -It exists in the sparse memory mapping model, and it is also somewhat -similar to the mem_map variable, both of them are used to translate an -address. - -page ----- - -The size of a page structure. struct page is an important data structure -and it is widely used to compute contiguous memory. - -pglist_data ------------ - -The size of a pglist_data structure. This value is used to check if the -pglist_data structure is valid. It is also used for checking the memory -type. - -zone ----- - -The size of a zone structure. This value is used to check if the zone -structure has been found. It is also used for excluding free pages. - -free_area ---------- - -The size of a free_area structure. It indicates whether the free_area -structure is valid or not. Useful when excluding free pages. - -list_head ---------- - -The size of a list_head structure. Used when iterating lists in a -post-mortem analysis session. - -nodemask_t ----------- - -The size of a nodemask_t type. Used to compute the number of online -nodes. - -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) -------------------------------------------------------------------------------------------------- - -User-space tools compute their values based on the offset of these -variables. The variables are used when excluding unnecessary pages. - -(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) ------------------------------------------------------------------------------------------ - -On NUMA machines, each NUMA node has a pg_data_t to describe its memory -layout. On UMA machines there is a single pglist_data which describes the -whole memory. - -These values are used to check the memory type and to compute the -virtual address for memory map. - -(zone, free_area|vm_stat|spanned_pages) ---------------------------------------- - -Each node is divided into a number of blocks called zones which -represent ranges within memory. A zone is described by a structure zone. - -User-space tools compute required values based on the offset of these -variables. - -(free_area, free_list) ----------------------- - -Offset of the free_list's member. This value is used to compute the number -of free pages. - -Each zone has a free_area structure array called free_area[MAX_ORDER]. -The free_list represents a linked list of free page blocks. - -(list_head, next|prev) ----------------------- - -Offsets of the list_head's members. list_head is used to define a -circular linked list. User-space tools need these in order to traverse -lists. - -(vmap_area, va_start|list) --------------------------- - -Offsets of the vmap_area's members. They carry vmalloc-specific -information. Makedumpfile gets the start address of the vmalloc region -from this. - -(zone.free_area, MAX_ORDER) ---------------------------- - -Free areas descriptor. User-space tools use this value to iterate the -free_area ranges. MAX_ORDER is used by the zone buddy allocator. - -log_first_idx -------------- - -Index of the first record stored in the buffer log_buf. Used by -user-space tools to read the strings in the log_buf. - -log_buf -------- - -Console output is written to the ring buffer log_buf at index -log_first_idx. Used to get the kernel log. - -log_buf_len ------------ - -log_buf's length. - -clear_idx ---------- - -The index that the next printk() record to read after the last clear -command. It indicates the first record after the last SYSLOG_ACTION -_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump -the dmesg log. - -log_next_idx ------------- - -The index of the next record to store in the buffer log_buf. Used to -compute the index of the current buffer position. - -printk_log ----------- - -The size of a structure printk_log. Used to compute the size of -messages, and extract dmesg log. It encapsulates header information for -log_buf, such as timestamp, syslog level, etc. - -(printk_log, ts_nsec|len|text_len|dict_len) -------------------------------------------- - -It represents field offsets in struct printk_log. User space tools -parse it and check whether the values of printk_log's members have been -changed. - -(free_area.free_list, MIGRATE_TYPES) ------------------------------------- - -The number of migrate types for pages. The free_list is described by the -array. Used by tools to compute the number of free pages. - -NR_FREE_PAGES -------------- - -On linux-2.6.21 or later, the number of free pages is in -vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. - -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask ------------------------------------------------------------------------------- - -Page attributes. These flags are used to filter various unnecessary for -dumping pages. - -PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) ------------------------------------------------------------------------------ - -More page attributes. These flags are used to filter various unnecessary for -dumping pages. - - -HUGETLB_PAGE_DTOR ------------------ - -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile -excludes these pages. - -x86_64 -====== - -phys_base ---------- - -Used to convert the virtual address of an exported kernel symbol to its -corresponding physical address. - -init_top_pgt ------------- - -Used to walk through the whole page table and convert virtual addresses -to physical addresses. The init_top_pgt is somewhat similar to -swapper_pg_dir, but it is only used in x86_64. - -pgtable_l5_enabled ------------------- - -User-space tools need to know whether the crash kernel was in 5-level -paging mode. - -node_data ---------- - -This is a struct pglist_data array and stores all NUMA nodes -information. Makedumpfile gets the pglist_data structure from it. - -(node_data, MAX_NUMNODES) -------------------------- - -The maximum number of nodes in system. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -KERNEL_IMAGE_SIZE ------------------ - -Currently unused by Makedumpfile. Used to compute the module virtual -address by Crash. - -sme_mask --------- - -AMD-specific with SME support: it indicates the secure memory encryption -mask. Makedumpfile tools need to know whether the crash kernel was -encrypted. If SME is enabled in the first kernel, the crash kernel's -page table entries (pgd/pud/pmd/pte) contain the memory encryption -mask. This is used to remove the SME mask and obtain the true physical -address. - -Currently, sme_mask stores the value of the C-bit position. If needed, -additional SME-relevant info can be placed in that variable. - -For example:: - - [ misc ][ enc bit ][ other misc SME info ] - 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 - 63 59 55 51 47 43 39 35 31 27 ... 3 - -x86_32 -====== - -X86_PAE -------- - -Denotes whether physical address extensions are enabled. It has the cost -of a higher page table lookup overhead, and also consumes more page -table space per process. Used to check whether PAE was enabled in the -crash kernel when converting virtual addresses to physical addresses. - -ia64 -==== - -pgdat_list|(pgdat_list, MAX_NUMNODES) -------------------------------------- - -pg_data_t array storing all NUMA nodes information. MAX_NUMNODES -indicates the number of the nodes. - -node_memblk|(node_memblk, NR_NODE_MEMBLKS) ------------------------------------------- - -List of node memory chunks. Filled when parsing the SRAT table to obtain -information about memory nodes. NR_NODE_MEMBLKS indicates the number of -node memory chunks. - -These values are used to compute the number of nodes the crashed kernel used. - -node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) ----------------------------------------------------------------- - -The size of a struct node_memblk_s and the offsets of the -node_memblk_s's members. Used to compute the number of nodes. - -PGTABLE_3|PGTABLE_4 -------------------- - -User-space tools need to know whether the crash kernel was in 3-level or -4-level paging mode. Used to distinguish the page table. - -ARM64 -===== - -VA_BITS -------- - -The maximum number of bits for virtual addresses. Used to compute the -virtual memory ranges. - -kimage_voffset --------------- - -The offset between the kernel virtual and physical mappings. Used to -translate virtual to physical addresses. - -PHYS_OFFSET ------------ - -Indicates the physical address of the start of memory. Similar to -kimage_voffset, which is used to translate virtual to physical -addresses. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -arm -=== - -ARM_LPAE --------- - -It indicates whether the crash kernel supports large physical address -extensions. Used to translate virtual to physical addresses. - -s390 -==== - -lowcore_ptr ------------ - -An array with a pointer to the lowcore of every CPU. Used to print the -psw and all registers information. - -high_memory ------------ - -Used to get the vmalloc_start address from the high_memory symbol. - -(lowcore_ptr, NR_CPUS) ----------------------- - -The maximum number of CPUs. - -powerpc -======= - - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -contig_page_data ----------------- - -See above. - -vmemmap_list ------------- - -The vmemmap_list maintains the entire vmemmap physical mapping. Used -to get vmemmap list count and populated vmemmap regions info. If the -vmemmap address translation information is stored in the crash kernel, -it is used to translate vmemmap kernel virtual addresses. - -mmu_vmemmap_psize ------------------ - -The size of a page. Used to translate virtual to physical addresses. - -mmu_psize_defs --------------- - -Page size definitions, i.e. 4k, 64k, or 16M. - -Used to make vtop translations. - -vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) --------------------------------------------------------------------------------------------- - -The vmemmap virtual address space management does not have a traditional -page table to track which virtual struct pages are backed by a physical -mapping. The virtual to physical mappings are tracked in a simple linked -list format. - -User-space tools need to know the offset of list, phys and virt_addr -when computing the count of vmemmap regions. - -mmu_psize_def|(mmu_psize_def, shift) ------------------------------------- - -The size of a struct mmu_psize_def and the offset of mmu_psize_def's -member. - -Used in vtop translations. - -sh -== - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -X2TLB ------ - -Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 0c41d6d463f3..10e7f4d16c14 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -59,7 +59,7 @@ as follows: the default calculated size. Use this option if default boot memory size is not sufficient for second kernel to boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/kdump/kdump.rst. If any offset is + refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is provided in crashkernel= parameter, it will be ignored as fadump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt index 368ddd05b304..c5f3bda7abcb 100644 --- a/Documentation/translations/zh_CN/oops-tracing.txt +++ b/Documentation/translations/zh_CN/oops-tracing.txt @@ -53,8 +53,8 @@ cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“ (2)用串口终端启动(请参看Documentation/admin-guide/serial-console.rst),运行一个null modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。 -(3)使用Kdump(请参看Documentation/kdump/kdump.rst), -使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 +(3)使用Kdump(请参看Documentation/admin-guide/kdump/kdump.rst), +使用在Documentation/admin-guide/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 环形缓冲区。 完整信息 -- cgit From e7751617dd0599ceadf4221cb08e04307b00aa1f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 11:47:10 -0300 Subject: docs: blockdev: add it to the admin-guide The blockdev book basically contains user-faced documentation. Signed-off-by: Mauro Carvalho Chehab --- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 +++++++++++++++++++++ .../blockdev/drbd/DRBD-data-packets.svg | 459 ++++++++++++++++ .../admin-guide/blockdev/drbd/conn-states-8.dot | 18 + .../blockdev/drbd/data-structure-v9.rst | 42 ++ .../admin-guide/blockdev/drbd/disk-states-8.dot | 16 + .../drbd/drbd-connection-state-overview.dot | 85 +++ .../admin-guide/blockdev/drbd/figures.rst | 28 + Documentation/admin-guide/blockdev/drbd/index.rst | 19 + .../admin-guide/blockdev/drbd/node-states-8.dot | 13 + Documentation/admin-guide/blockdev/floppy.rst | 255 +++++++++ Documentation/admin-guide/blockdev/index.rst | 14 + Documentation/admin-guide/blockdev/nbd.rst | 31 ++ Documentation/admin-guide/blockdev/paride.rst | 439 +++++++++++++++ Documentation/admin-guide/blockdev/ramdisk.rst | 177 +++++++ Documentation/admin-guide/blockdev/zram.rst | 422 +++++++++++++++ Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 18 +- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 --------------------- Documentation/blockdev/drbd/DRBD-data-packets.svg | 459 ---------------- Documentation/blockdev/drbd/conn-states-8.dot | 18 - Documentation/blockdev/drbd/data-structure-v9.rst | 42 -- Documentation/blockdev/drbd/disk-states-8.dot | 16 - .../drbd/drbd-connection-state-overview.dot | 85 --- Documentation/blockdev/drbd/figures.rst | 28 - Documentation/blockdev/drbd/index.rst | 19 - Documentation/blockdev/drbd/node-states-8.dot | 14 - Documentation/blockdev/floppy.rst | 255 --------- Documentation/blockdev/index.rst | 16 - Documentation/blockdev/nbd.rst | 31 -- Documentation/blockdev/paride.rst | 439 --------------- Documentation/blockdev/ramdisk.rst | 177 ------- Documentation/blockdev/zram.rst | 422 --------------- 32 files changed, 2616 insertions(+), 2618 deletions(-) create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/conn-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/disk-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/figures.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/index.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/node-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/floppy.rst create mode 100644 Documentation/admin-guide/blockdev/index.rst create mode 100644 Documentation/admin-guide/blockdev/nbd.rst create mode 100644 Documentation/admin-guide/blockdev/paride.rst create mode 100644 Documentation/admin-guide/blockdev/ramdisk.rst create mode 100644 Documentation/admin-guide/blockdev/zram.rst delete mode 100644 Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/DRBD-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/conn-states-8.dot delete mode 100644 Documentation/blockdev/drbd/data-structure-v9.rst delete mode 100644 Documentation/blockdev/drbd/disk-states-8.dot delete mode 100644 Documentation/blockdev/drbd/drbd-connection-state-overview.dot delete mode 100644 Documentation/blockdev/drbd/figures.rst delete mode 100644 Documentation/blockdev/drbd/index.rst delete mode 100644 Documentation/blockdev/drbd/node-states-8.dot delete mode 100644 Documentation/blockdev/floppy.rst delete mode 100644 Documentation/blockdev/index.rst delete mode 100644 Documentation/blockdev/nbd.rst delete mode 100644 Documentation/blockdev/paride.rst delete mode 100644 Documentation/blockdev/ramdisk.rst delete mode 100644 Documentation/blockdev/zram.rst (limited to 'Documentation') diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg new file mode 100644 index 000000000000..f87cfa0dc2fb --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg @@ -0,0 +1,588 @@ + + + + + + Master slide + + + + + + + + + + RSDataReply + + + + + + + CsumRSRequest + + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + + WriteAck + + + + got_BlockAck() + + + Checksum based Resync, case not in sync + + + DRBD-8.3 data flow + + + w_e_send_csum() + + + + + + + + RSIsInSync + + + + + + + CsumRSRequest + + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + got_IsInSync() + + + Checksum based Resync, case in sync + + + + + + + + + + OVReply + + + + + + + OVRequest + + + + receive_OVRequest() + + + drbd_endio_read_sec() + + + w_e_end_ov_req() + + + receive_OVReply() + + + drbd_endio_read_sec() + + + w_e_end_ov_reply() + + + + + + OVResult + + + + got_OVResult() + + + Online verify + + + w_make_ov_request() + + + + + + + + drbd_endio_read_sec() + + + w_make_resync_request() + + + w_e_send_csum() + + + + + drbd_endio_read_sec() + + + + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg new file mode 100644 index 000000000000..48a1e2165fec --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg @@ -0,0 +1,459 @@ + + + + + + Master slide + + + + + + + + + RSDataReply + + + + + RSDataRequest + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_rsdata_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + WriteAck + + + got_BlockAck() + + + Resync blocks, 4-32K + + + + + + + WriteAck + + + + + Data + + + drbd_make_request() + + + receive_Data() + + + drbd_endio_write_sec() + + + e_end_block() + + + got_BlockAck() + + + Regular mirrored write, 512-32K + + + w_send_dblock() + + + + + drbd_endio_write_pri() + + + + + + + DataReply + + + + + DataRequest + + + drbd_make_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_data_req() + + + Drawing + + receive_DataReply() + + + + Diskless read, 512-32K + + + w_send_read_req() + + + DRBD 8 data flow + + + + + + al_begin_io() + + + al_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot new file mode 100644 index 000000000000..025e8cf5e64a --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot @@ -0,0 +1,18 @@ +digraph conn_states { + StandAllone -> WFConnection [ label = "ioctl_set_net()" ] + WFConnection -> Unconnected [ label = "unable to bind()" ] + WFConnection -> WFReportParams [ label = "in connect() after accept" ] + WFReportParams -> StandAllone [ label = "checks in receive_param()" ] + WFReportParams -> Connected [ label = "in receive_param()" ] + WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] + WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] + WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] + WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] + SyncSource -> Connected + SyncTarget -> Connected + SyncSource -> PausedSyncS + SyncTarget -> PausedSyncT + PausedSyncS -> SyncSource + PausedSyncT -> SyncTarget + Connected -> WFConnection [ label = "* on network error" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst new file mode 100644 index 000000000000..66036b901644 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst @@ -0,0 +1,42 @@ +================================ +kernel data structure for DRBD-9 +================================ + +This describes the in kernel data structure for DRBD-9. Starting with +Linux v3.14 we are reorganizing DRBD to use this data structure. + +Basic Data Structure +==================== + +A node has a number of DRBD resources. Each such resource has a number of +devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD +device is represented by a block device locally. + +The DRBD objects are interconnected to form a matrix as depicted below; a +drbd_peer_device object sits at each intersection between a drbd_device and a +drbd_connection:: + + /--------------+---------------+.....+---------------\ + | resource | device | | device | + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + +--------------+---------------+.....+---------------+ + : : : : : + : : : : : + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + \--------------+---------------+.....+---------------/ + +In this table, horizontally, devices can be accessed from resources by their +volume number. Likewise, peer_devices can be accessed from connections by +their volume number. Objects in the vertical direction are connected by double +linked lists. There are back pointers from peer_devices to their connections a +devices, and from connections and devices to their resource. + +All resources are in the drbd_resources double-linked list. In addition, all +devices can be accessed by their minor device number via the drbd_devices idr. + +The drbd_resource, drbd_connection, and drbd_device objects are reference +counted. The peer_device objects only serve to establish the links between +devices and connections; their lifetime is determined by the lifetime of the +device and connection which they reference. diff --git a/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot new file mode 100644 index 000000000000..d06cfb46fb98 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot @@ -0,0 +1,16 @@ +digraph disk_states { + Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] + Diskless -> Consistent [ label = "ioctl_set_disk()" ] + Diskless -> Outdated [ label = "ioctl_set_disk()" ] + Consistent -> Outdated [ label = "receive_param()" ] + Consistent -> UpToDate [ label = "receive_param()" ] + Consistent -> Inconsistent [ label = "start resync" ] + Outdated -> Inconsistent [ label = "start resync" ] + UpToDate -> Inconsistent [ label = "ioctl_replicate" ] + Inconsistent -> UpToDate [ label = "resync completed" ] + Consistent -> Failed [ label = "io completion error" ] + Outdated -> Failed [ label = "io completion error" ] + UpToDate -> Failed [ label = "io completion error" ] + Inconsistent -> Failed [ label = "io completion error" ] + Failed -> Diskless [ label = "sending notify to peer" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot new file mode 100644 index 000000000000..6d9cf0a7b11d --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot @@ -0,0 +1,85 @@ +// vim: set sw=2 sts=2 : +digraph { + rankdir=BT + bgcolor=white + + node [shape=plaintext] + node [fontcolor=black] + + StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] + + node [fontcolor=lightgray] + + Unconnected [ label=Unconnected ] + + CommTrouble [ shape=record, + label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] + + node [fontcolor=gray] + + subgraph cluster_try_connect { + label="try to connect, handshake" + rank=max + WFConnection [ label=WFConnection ] + WFReportParams [ label=WFReportParams ] + } + + TearDown [ label=TearDown ] + + Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] + + node [fontcolor=lightblue] + + StartingSyncS [ label=StartingSyncS ] + StartingSyncT [ label=StartingSyncT ] + + subgraph cluster_bitmap_exchange { + node [fontcolor=red] + fontcolor=red + label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" + + WFBitMapT [ label=WFBitMapT ] + WFSyncUUID [ label=WFSyncUUID ] + WFBitMapS [ label=WFBitMapS ] + } + + node [fontcolor=blue] + + cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] + + node [shape=box,fontcolor=black] + + // drbdadm [label="drbdadm connect"] + // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] + // comm_error [label="communication trouble"] + + // + // edges + // -------------------------------------- + + StandAlone -> Unconnected [ label="drbdadm connect" ] + Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] + Unconnected -> WFConnection [ label="receiver thread is started" ] + WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] + + WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] + WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] + + WFReportParams -> WFBitMapS + WFReportParams -> WFBitMapT + WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] + + WFBitMapS -> cluster_resync:S + WFSyncUUID -> cluster_resync:T + + edge [color=green] + cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] + + edge [color=red] + WFReportParams -> CommTrouble + Connected -> CommTrouble + cluster_resync:any -> CommTrouble + edge [color=black] + CommTrouble -> Unconnected [label="receiver thread is stopped" ] + +} diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst new file mode 100644 index 000000000000..3e3fd4b8a478 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/figures.rst @@ -0,0 +1,28 @@ +.. The here included files are intended to help understand the implementation + +Data flows that Relate some functions, and write packets +======================================================== + +.. kernel-figure:: DRBD-8.3-data-packets.svg + :alt: DRBD-8.3-data-packets.svg + :align: center + +.. kernel-figure:: DRBD-data-packets.svg + :alt: DRBD-data-packets.svg + :align: center + + +Sub graphs of DRBD's state transitions +====================================== + +.. kernel-figure:: conn-states-8.dot + :alt: conn-states-8.dot + :align: center + +.. kernel-figure:: disk-states-8.dot + :alt: disk-states-8.dot + :align: center + +.. kernel-figure:: node-states-8.dot + :alt: node-states-8.dot + :align: center diff --git a/Documentation/admin-guide/blockdev/drbd/index.rst b/Documentation/admin-guide/blockdev/drbd/index.rst new file mode 100644 index 000000000000..68ecd5c113e9 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/index.rst @@ -0,0 +1,19 @@ +========================================== +Distributed Replicated Block Device - DRBD +========================================== + +Description +=========== + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +.. toctree:: + :maxdepth: 1 + + data-structure-v9 + figures diff --git a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot new file mode 100644 index 000000000000..bfa54e1f8016 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot @@ -0,0 +1,13 @@ +digraph node_states { + Secondary -> Primary [ label = "ioctl_set_state()" ] + Primary -> Secondary [ label = "ioctl_set_state()" ] +} + +digraph peer_states { + Secondary -> Primary [ label = "recv state packet" ] + Primary -> Secondary [ label = "recv state packet" ] + Primary -> Unknown [ label = "connection lost" ] + Secondary -> Unknown [ label = "connection lost" ] + Unknown -> Primary [ label = "connected" ] + Unknown -> Secondary [ label = "connected" ] +} diff --git a/Documentation/admin-guide/blockdev/floppy.rst b/Documentation/admin-guide/blockdev/floppy.rst new file mode 100644 index 000000000000..4a8f31cf4139 --- /dev/null +++ b/Documentation/admin-guide/blockdev/floppy.rst @@ -0,0 +1,255 @@ +============= +Floppy Driver +============= + +FAQ list: +========= + +A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + +The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + +Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad):: + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9:: + + append = "floppy=thinkpad" + +Several floppy related options may be given, example:: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + +If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + +If you use the floppy driver as a module, use the following syntax:: + + modprobe floppy floppy="" + +Example:: + + modprobe floppy floppy="omnibook messages" + +If you need certain options enabled every time you load the floppy driver, +you can put:: + + options floppy floppy="omnibook messages" + +in a configuration file in /etc/modprobe.d/. + + +The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc / floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook / floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + == ================================== + 0 Use the value of the physical CMOS + 1 5 1/4 DD + 2 5 1/4 HD + 3 3 1/2 DD + 4 3 1/2 HD + 5 3 1/2 ED + 6 3 1/2 ED + 16 unknown or not installed + == ================================== + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts / floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate:: + + PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases. + + +Supporting utilities and additional documentation: +================================================== + +Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + + http://fdutils.linux.lu + +The fdutils releases can be found at: + + http://fdutils.linux.lu/download.html + + http://www.tux.org/pub/knaff/fdutils/ + + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + +If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + +Be sure to read the FAQ before mailing/posting any bug reports! + +Alain + +Changelog +========= + +10-30-2004 : + Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : + Original Document diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst new file mode 100644 index 000000000000..20a738d9d047 --- /dev/null +++ b/Documentation/admin-guide/blockdev/index.rst @@ -0,0 +1,14 @@ +=========================== +The Linux RapidIO Subsystem +=========================== + +.. toctree:: + :maxdepth: 1 + + floppy + nbd + paride + ramdisk + zram + + drbd/index diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst new file mode 100644 index 000000000000..d78dfe559dcf --- /dev/null +++ b/Documentation/admin-guide/blockdev/nbd.rst @@ -0,0 +1,31 @@ +================================== +Network Block Device (TCP version) +================================== + +1) Overview +----------- + +What is it: With this compiled in the kernel (or as a module), Linux +can use a remote server as one of its block devices. So every time +the client computer wants to read, e.g., /dev/nb0, it sends a +request over TCP to the server, which will reply with the data read. +This can be used for stations with low disk space (or even diskless) +to borrow disk space from another computer. +Unlike NFS, it is possible to put any filesystem on it, etc. + +For more information, or to download the nbd-client and nbd-server +tools, go to http://nbd.sf.net/. + +The nbd kernel module need only be installed on the client +system, as the nbd-server is completely in userspace. In fact, +the nbd-server has been successfully ported to other operating +systems, including Windows. + +A) NBD parameters +----------------- + +max_part + Number of partitions per device (default: 0). + +nbds_max + Number of block devices that should be initialized (default: 16). diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst new file mode 100644 index 000000000000..87b4278bf314 --- /dev/null +++ b/Documentation/admin-guide/blockdev/paride.rst @@ -0,0 +1,439 @@ +=================================== +Linux and parallel port IDE devices +=================================== + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction +=============== + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + - MicroSolutions backpack CD-ROM + - MicroSolutions backpack PD/CD + - MicroSolutions backpack hard-drives + - MicroSolutions backpack 8000t tape drive + - SyQuest EZ-135, EZ-230 & SparQ drives + - Avatar Shark + - Imation Superdisk LS-120 + - Maxell Superdisk LS-120 + - FreeCom Power CD + - Hewlett-Packard 5GB and 8GB tape drives + - Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + === ============= + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + === ============= + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + ==== ====================================== ==== + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + ==== ====================================== ==== + + +2. Using the PARIDE subsystem +============================= + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + ================ ============ ====== ======== + Manufacturer Model Driver Protocol + ================ ============ ====== ======== + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + ================ ============ ====== ======== + +2.1 Configuring built-in drivers +--------------------------------- + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like:: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command:: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules +---------------------------------------------- + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: + using these drivers with the "kerneld" automatic module loading + system is not recommended for beginners, and is not documented here. + +Note 2: + if you build PARPORT support as a loadable module, PARIDE must + also be built as loadable modules, and PARPORT must be loaded before + the PARIDE modules. + +To use PARIDE, you must begin by:: + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example:: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command:: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378:: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device +-------------------------- + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute:: + + #!/bin/bash + # + # mkd -- a script to create the device special files for the PARIDE subsystem + # + function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 + } + # + function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done + } + # + cd /dev + # + for u in 0 1 2 3 ; do pd $u ; done + for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done + for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done + for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done + for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done + for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done + # + # end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like:: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system:: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver +------------------ + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver +------------------------ + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver +------------------------ + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting +================== + +3.1 Use EPP mode if you can +---------------------------- + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay +------------------------- + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset +------------------------------------- + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing:: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help +------------------------------------------------------ + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in:: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help +--------------------------------- + +You can join the linux-parport mailing list by sending a mail message +to: + + linux-parport-request@torque.net + +with the single word:: + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/admin-guide/blockdev/ramdisk.rst b/Documentation/admin-guide/blockdev/ramdisk.rst new file mode 100644 index 000000000000..b7c2268f8dec --- /dev/null +++ b/Documentation/admin-guide/blockdev/ramdisk.rst @@ -0,0 +1,177 @@ +========================================== +Using the RAM disk block device with Linux +========================================== + +.. Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Parameters +--------------------------------- + +2a) Kernel Command Line Parameters + + ramdisk_size=N + Size of the ramdisk. + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB). + +2b) Module parameters + + rd_nr + /dev/ramX devices created. + + max_part + Maximum partition number. + + rd_size + See ramdisk_size. + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below:: + + ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF + ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 + ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do:: + + /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use:: + + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" + +Since the default start = 0 and the default prompt = 1, you could use:: + + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +----------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create:: + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example:: + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing:: + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy:: + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB):: + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552:: + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + + + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : + Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : + Original Document diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst new file mode 100644 index 000000000000..6eccf13219ff --- /dev/null +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -0,0 +1,422 @@ +======================================== +zram: Compressed RAM based block devices +======================================== + +Introduction +============ + +The zram module creates RAM based block devices named /dev/zram +( = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram/ + +Usage +===== + +There are several ways to configure and manage zram device(-s): + +a) using zram and zram_control sysfs attributes +b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). + +In this document we will describe only 'manual' zram configuration steps, +IOW, zram and zram_control sysfs attributes. + +In order to get a better idea about zramctl please consult util-linux +documentation, zramctl man-page or `zramctl --help`. Please be informed +that zram maintainers do not develop/maintain util-linux or zramctl, should +you have any questions please contact util-linux@vger.kernel.org + +Following shows a typical sequence of steps for using zram. + +WARNING +======= + +For the sake of simplicity we skip error checking parts in most of the +examples below. However, it is your sole responsibility to handle errors. + +zram sysfs attributes always return negative values in case of errors. +The list of possible return codes: + +======== ============================================================= +-EBUSY an attempt to modify an attribute that cannot be changed once + the device has been initialised. Please reset device first; +-ENOMEM zram was not able to allocate enough memory to fulfil your + needs; +-EINVAL invalid input has been provided. +======== ============================================================= + +If you use 'echo', the returned value that is changed by 'echo' utility, +and, in general case, something like:: + + echo 3 > /sys/block/zram0/max_comp_streams + if [ $? -ne 0 ]; + handle_error + fi + +should suffice. + +1) Load Module +============== + +:: + + modprobe zram num_devices=4 + This creates 4 devices: /dev/zram{0,1,2,3} + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. + +2) Set max number of compression streams +======================================== + +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available:: + + cat /sys/block/zram0/max_comp_streams + +3) Select compression algorithm +=============================== + +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). + +Examples:: + + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +For the time being, the `comp_algorithm` content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + +4) Set Disksize +=============== + +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples:: + + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize + + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize + +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + +5) Set memory limit: Optional +============================= + +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples:: + + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate +=========== + +:: + + mkswap /dev/zram0 + swapon /dev/zram0 + + mkfs.ext4 /dev/zram1 + mount /dev/zram1 /tmp + +7) Add/remove zram devices +========================== + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram) or error code. + +Example:: + + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute:: + + echo X > /sys/class/zram-control/hot_remove + +8) Stats +======== + +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +====================== ====== =============================================== +Name access description +====================== ====== =============================================== +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +mem_used_max WO reset the `mem_used_max` counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can + use to store the compressed data +writeback_limit WO specifies the maximum amount of write IO zram + can write out to backing device as 4KB unit +writeback_limit_enable RW show and set writeback_limit feature +max_comp_streams RW the number of possible concurrent compress + operations +comp_algorithm RW show and change the compression algorithm +compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out +idle WO mark allocated slot as idle +====================== ====== =============================================== + + +User space is advised to use the following files to read the device statistics. + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.rst for +details. + +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + + ============= ============================================================= + failed_reads The number of failed reads + failed_writes The number of failed writes + invalid_io The number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + + a) the number of pages freed because of swap slot free + notifications + b) the number of pages freed because of + REQ_OP_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. + ============= ============================================================= + +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + + ================ ============================================================= + orig_data_size uncompressed size of data stored in this disk. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + same_pages the number of same element filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + ================ ============================================================= + +File /sys/block/zram/bd_stat + +The stat file represents device's backing device statistics. It consists of +a single line of text and contains the following stats separated by whitespace: + + ============== ============================================================= + bd_count size of data written in backing device. + Unit: 4K bytes + bd_reads the number of reads from backing device + Unit: 4K bytes + bd_writes the number of writes to backing device + Unit: 4K bytes + ============== ============================================================= + +9) Deactivate +============= + +:: + + swapoff /dev/zram0 + umount /dev/zram1 + +10) Reset +========= + + Write any positive value to 'reset' sysfs node:: + + echo 1 > /sys/block/zram0/reset + echo 1 > /sys/block/zram1/reset + + This frees all the memory allocated for the given device and + resets the disksize to zero. You must set the disksize again + before reusing the device. + +Optional Feature +================ + +writeback +--------- + +With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page +to backing storage rather than keeping it in memory. +To use the feature, admin should set up backing device via:: + + echo /dev/sda5 > /sys/block/zramX/backing_dev + +before disksize setting. It supports only partition at this moment. +If admin want to use incompressible page writeback, they could do via:: + + echo huge > /sys/block/zramX/write + +To use idle page writeback, first, user need to declare zram pages +as idle:: + + echo all > /sys/block/zramX/idle + +From now on, any pages on zram are idle pages. The idle mark +will be removed until someone request access of the block. +IOW, unless there is access request, those pages are still idle pages. + +Admin can request writeback of those idle pages at right timing via:: + + echo idle > /sys/block/zramX/writeback + +With the command, zram writeback idle pages from memory to the storage. + +If there are lots of write IO with flash device, potentially, it has +flash wearout problem so that admin needs to design write limitation +to guarantee storage health for entire product life. + +To overcome the concern, zram supports "writeback_limit" feature. +The "writeback_limit_enable"'s default value is 0 so that it doesn't limit +any writeback. IOW, if admin want to apply writeback budget, he should +enable writeback_limit_enable via:: + + $ echo 1 > /sys/block/zramX/writeback_limit_enable + +Once writeback_limit_enable is set, zram doesn't allow any writeback +until admin set the budget via /sys/block/zramX/writeback_limit. + +(If admin doesn't enable writeback_limit_enable, writeback_limit's value +assigned via /sys/block/zramX/writeback_limit is meaninless.) + +If admin want to limit writeback as per-day 400M, he could do it +like below:: + + $ MB_SHIFT=20 + $ 4K_SHIFT=12 + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit. + $ echo 1 > /sys/block/zram0/writeback_limit_enable + +If admin want to allow further write again once the bugdet is exausted, +he could do it like below:: + + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit + +If admin want to see remaining writeback budget since he set:: + + $ cat /sys/block/zramX/writeback_limit + +If admin want to disable writeback limit, he could do:: + + $ echo 0 > /sys/block/zramX/writeback_limit_enable + +The writeback_limit count will reset whenever you reset zram(e.g., +system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of +writeback happened until you reset the zram to allocate extra writeback +budget in next setting is user's job. + +If admin want to measure writeback count in a certain period, he could +know it via /sys/block/zram0/bd_stat's 3rd column. + +memory tracking +=============== + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. + +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: + + 300 75.033841 .wh. + 301 63.806904 s... + 302 63.806919 ..hi + +First column + zram's block index. +Second column + access time since the system was booted +Third column + state of the block: + + s: + same page + w: + written page to backing store + h: + huge page + i: + idle page + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + +Nitin Gupta +ngupta@vflare.org diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 5b63182ceb5f..9228fbf5ce4e 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -73,6 +73,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + blockdev/index ext4 binderfs pm/index diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e645b3ab4b6f..78576aa45cce 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1249,7 +1249,7 @@ See also Documentation/fault-injection/. floppy= [HW] - See Documentation/blockdev/floppy.rst. + See Documentation/admin-guide/blockdev/floppy.rst. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -2234,7 +2234,7 @@ memblock=debug [KNL] Enable memblock debug messages. load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -3268,7 +3268,7 @@ pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/blockdev/paride.rst. + See also Documentation/admin-guide/blockdev/paride.rst. pci=option[,option...] [PCI] various PCI subsystem options. @@ -3512,7 +3512,7 @@ needed on a platform with proper driver support. pd. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -3527,10 +3527,10 @@ and performance comparison. pf. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pg. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.rst. @@ -3642,7 +3642,7 @@ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. psi= [KNL] Enable or disable pressure stall information tracking. @@ -3664,7 +3664,7 @@ pstore.backend= Specify the name of the pstore backend to use pt. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pti= [X86_64] Control Page Table Isolation of user and kernel address spaces. Disabling this feature @@ -3693,7 +3693,7 @@ See Documentation/admin-guide/md.rst. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. random.trust_cpu={on,off} [KNL] Enable or disable trusting the use of the diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg deleted file mode 100644 index f87cfa0dc2fb..000000000000 --- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg +++ /dev/null @@ -1,588 +0,0 @@ - - - - - - Master slide - - - - - - - - - - RSDataReply - - - - - - - CsumRSRequest - - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - - WriteAck - - - - got_BlockAck() - - - Checksum based Resync, case not in sync - - - DRBD-8.3 data flow - - - w_e_send_csum() - - - - - - - - RSIsInSync - - - - - - - CsumRSRequest - - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - got_IsInSync() - - - Checksum based Resync, case in sync - - - - - - - - - - OVReply - - - - - - - OVRequest - - - - receive_OVRequest() - - - drbd_endio_read_sec() - - - w_e_end_ov_req() - - - receive_OVReply() - - - drbd_endio_read_sec() - - - w_e_end_ov_reply() - - - - - - OVResult - - - - got_OVResult() - - - Online verify - - - w_make_ov_request() - - - - - - - - drbd_endio_read_sec() - - - w_make_resync_request() - - - w_e_send_csum() - - - - - drbd_endio_read_sec() - - - - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg deleted file mode 100644 index 48a1e2165fec..000000000000 --- a/Documentation/blockdev/drbd/DRBD-data-packets.svg +++ /dev/null @@ -1,459 +0,0 @@ - - - - - - Master slide - - - - - - - - - RSDataReply - - - - - RSDataRequest - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_rsdata_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - WriteAck - - - got_BlockAck() - - - Resync blocks, 4-32K - - - - - - - WriteAck - - - - - Data - - - drbd_make_request() - - - receive_Data() - - - drbd_endio_write_sec() - - - e_end_block() - - - got_BlockAck() - - - Regular mirrored write, 512-32K - - - w_send_dblock() - - - - - drbd_endio_write_pri() - - - - - - - DataReply - - - - - DataRequest - - - drbd_make_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_data_req() - - - Drawing - - receive_DataReply() - - - - Diskless read, 512-32K - - - w_send_read_req() - - - DRBD 8 data flow - - - - - - al_begin_io() - - - al_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot deleted file mode 100644 index 025e8cf5e64a..000000000000 --- a/Documentation/blockdev/drbd/conn-states-8.dot +++ /dev/null @@ -1,18 +0,0 @@ -digraph conn_states { - StandAllone -> WFConnection [ label = "ioctl_set_net()" ] - WFConnection -> Unconnected [ label = "unable to bind()" ] - WFConnection -> WFReportParams [ label = "in connect() after accept" ] - WFReportParams -> StandAllone [ label = "checks in receive_param()" ] - WFReportParams -> Connected [ label = "in receive_param()" ] - WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] - WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] - WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] - WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] - SyncSource -> Connected - SyncTarget -> Connected - SyncSource -> PausedSyncS - SyncTarget -> PausedSyncT - PausedSyncS -> SyncSource - PausedSyncT -> SyncTarget - Connected -> WFConnection [ label = "* on network error" ] -} diff --git a/Documentation/blockdev/drbd/data-structure-v9.rst b/Documentation/blockdev/drbd/data-structure-v9.rst deleted file mode 100644 index 66036b901644..000000000000 --- a/Documentation/blockdev/drbd/data-structure-v9.rst +++ /dev/null @@ -1,42 +0,0 @@ -================================ -kernel data structure for DRBD-9 -================================ - -This describes the in kernel data structure for DRBD-9. Starting with -Linux v3.14 we are reorganizing DRBD to use this data structure. - -Basic Data Structure -==================== - -A node has a number of DRBD resources. Each such resource has a number of -devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD -device is represented by a block device locally. - -The DRBD objects are interconnected to form a matrix as depicted below; a -drbd_peer_device object sits at each intersection between a drbd_device and a -drbd_connection:: - - /--------------+---------------+.....+---------------\ - | resource | device | | device | - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - +--------------+---------------+.....+---------------+ - : : : : : - : : : : : - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - \--------------+---------------+.....+---------------/ - -In this table, horizontally, devices can be accessed from resources by their -volume number. Likewise, peer_devices can be accessed from connections by -their volume number. Objects in the vertical direction are connected by double -linked lists. There are back pointers from peer_devices to their connections a -devices, and from connections and devices to their resource. - -All resources are in the drbd_resources double-linked list. In addition, all -devices can be accessed by their minor device number via the drbd_devices idr. - -The drbd_resource, drbd_connection, and drbd_device objects are reference -counted. The peer_device objects only serve to establish the links between -devices and connections; their lifetime is determined by the lifetime of the -device and connection which they reference. diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot deleted file mode 100644 index d06cfb46fb98..000000000000 --- a/Documentation/blockdev/drbd/disk-states-8.dot +++ /dev/null @@ -1,16 +0,0 @@ -digraph disk_states { - Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] - Diskless -> Consistent [ label = "ioctl_set_disk()" ] - Diskless -> Outdated [ label = "ioctl_set_disk()" ] - Consistent -> Outdated [ label = "receive_param()" ] - Consistent -> UpToDate [ label = "receive_param()" ] - Consistent -> Inconsistent [ label = "start resync" ] - Outdated -> Inconsistent [ label = "start resync" ] - UpToDate -> Inconsistent [ label = "ioctl_replicate" ] - Inconsistent -> UpToDate [ label = "resync completed" ] - Consistent -> Failed [ label = "io completion error" ] - Outdated -> Failed [ label = "io completion error" ] - UpToDate -> Failed [ label = "io completion error" ] - Inconsistent -> Failed [ label = "io completion error" ] - Failed -> Diskless [ label = "sending notify to peer" ] -} diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot deleted file mode 100644 index 6d9cf0a7b11d..000000000000 --- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot +++ /dev/null @@ -1,85 +0,0 @@ -// vim: set sw=2 sts=2 : -digraph { - rankdir=BT - bgcolor=white - - node [shape=plaintext] - node [fontcolor=black] - - StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] - - node [fontcolor=lightgray] - - Unconnected [ label=Unconnected ] - - CommTrouble [ shape=record, - label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] - - node [fontcolor=gray] - - subgraph cluster_try_connect { - label="try to connect, handshake" - rank=max - WFConnection [ label=WFConnection ] - WFReportParams [ label=WFReportParams ] - } - - TearDown [ label=TearDown ] - - Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] - - node [fontcolor=lightblue] - - StartingSyncS [ label=StartingSyncS ] - StartingSyncT [ label=StartingSyncT ] - - subgraph cluster_bitmap_exchange { - node [fontcolor=red] - fontcolor=red - label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" - - WFBitMapT [ label=WFBitMapT ] - WFSyncUUID [ label=WFSyncUUID ] - WFBitMapS [ label=WFBitMapS ] - } - - node [fontcolor=blue] - - cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] - - node [shape=box,fontcolor=black] - - // drbdadm [label="drbdadm connect"] - // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] - // comm_error [label="communication trouble"] - - // - // edges - // -------------------------------------- - - StandAlone -> Unconnected [ label="drbdadm connect" ] - Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] - Unconnected -> WFConnection [ label="receiver thread is started" ] - WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] - - WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] - WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] - - WFReportParams -> WFBitMapS - WFReportParams -> WFBitMapT - WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] - - WFBitMapS -> cluster_resync:S - WFSyncUUID -> cluster_resync:T - - edge [color=green] - cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] - - edge [color=red] - WFReportParams -> CommTrouble - Connected -> CommTrouble - cluster_resync:any -> CommTrouble - edge [color=black] - CommTrouble -> Unconnected [label="receiver thread is stopped" ] - -} diff --git a/Documentation/blockdev/drbd/figures.rst b/Documentation/blockdev/drbd/figures.rst deleted file mode 100644 index 3e3fd4b8a478..000000000000 --- a/Documentation/blockdev/drbd/figures.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. The here included files are intended to help understand the implementation - -Data flows that Relate some functions, and write packets -======================================================== - -.. kernel-figure:: DRBD-8.3-data-packets.svg - :alt: DRBD-8.3-data-packets.svg - :align: center - -.. kernel-figure:: DRBD-data-packets.svg - :alt: DRBD-data-packets.svg - :align: center - - -Sub graphs of DRBD's state transitions -====================================== - -.. kernel-figure:: conn-states-8.dot - :alt: conn-states-8.dot - :align: center - -.. kernel-figure:: disk-states-8.dot - :alt: disk-states-8.dot - :align: center - -.. kernel-figure:: node-states-8.dot - :alt: node-states-8.dot - :align: center diff --git a/Documentation/blockdev/drbd/index.rst b/Documentation/blockdev/drbd/index.rst deleted file mode 100644 index 68ecd5c113e9..000000000000 --- a/Documentation/blockdev/drbd/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -========================================== -Distributed Replicated Block Device - DRBD -========================================== - -Description -=========== - - DRBD is a shared-nothing, synchronously replicated block device. It - is designed to serve as a building block for high availability - clusters and in this context, is a "drop-in" replacement for shared - storage. Simplistically, you could see it as a network RAID 1. - - Please visit http://www.drbd.org to find out more. - -.. toctree:: - :maxdepth: 1 - - data-structure-v9 - figures diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot deleted file mode 100644 index 4a2b00c23547..000000000000 --- a/Documentation/blockdev/drbd/node-states-8.dot +++ /dev/null @@ -1,14 +0,0 @@ -digraph node_states { - Secondary -> Primary [ label = "ioctl_set_state()" ] - Primary -> Secondary [ label = "ioctl_set_state()" ] -} - -digraph peer_states { - Secondary -> Primary [ label = "recv state packet" ] - Primary -> Secondary [ label = "recv state packet" ] - Primary -> Unknown [ label = "connection lost" ] - Secondary -> Unknown [ label = "connection lost" ] - Unknown -> Primary [ label = "connected" ] - Unknown -> Secondary [ label = "connected" ] -} - diff --git a/Documentation/blockdev/floppy.rst b/Documentation/blockdev/floppy.rst deleted file mode 100644 index 4a8f31cf4139..000000000000 --- a/Documentation/blockdev/floppy.rst +++ /dev/null @@ -1,255 +0,0 @@ -============= -Floppy Driver -============= - -FAQ list: -========= - -A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - -The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - -Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad):: - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9:: - - append = "floppy=thinkpad" - -Several floppy related options may be given, example:: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - -If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - -If you use the floppy driver as a module, use the following syntax:: - - modprobe floppy floppy="" - -Example:: - - modprobe floppy floppy="omnibook messages" - -If you need certain options enabled every time you load the floppy driver, -you can put:: - - options floppy floppy="omnibook messages" - -in a configuration file in /etc/modprobe.d/. - - -The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc / floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook / floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - == ================================== - 0 Use the value of the physical CMOS - 1 5 1/4 DD - 2 5 1/4 HD - 3 3 1/2 DD - 4 3 1/2 HD - 5 3 1/2 ED - 6 3 1/2 ED - 16 unknown or not installed - == ================================== - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts / floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate:: - - PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases. - - -Supporting utilities and additional documentation: -================================================== - -Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - - http://fdutils.linux.lu - -The fdutils releases can be found at: - - http://fdutils.linux.lu/download.html - - http://www.tux.org/pub/knaff/fdutils/ - - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - -If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - -Be sure to read the FAQ before mailing/posting any bug reports! - -Alain - -Changelog -========= - -10-30-2004 : - Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : - Original Document diff --git a/Documentation/blockdev/index.rst b/Documentation/blockdev/index.rst deleted file mode 100644 index a9af6ed8b4aa..000000000000 --- a/Documentation/blockdev/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -:orphan: - -=========================== -The Linux RapidIO Subsystem -=========================== - -.. toctree:: - :maxdepth: 1 - - floppy - nbd - paride - ramdisk - zram - - drbd/index diff --git a/Documentation/blockdev/nbd.rst b/Documentation/blockdev/nbd.rst deleted file mode 100644 index d78dfe559dcf..000000000000 --- a/Documentation/blockdev/nbd.rst +++ /dev/null @@ -1,31 +0,0 @@ -================================== -Network Block Device (TCP version) -================================== - -1) Overview ------------ - -What is it: With this compiled in the kernel (or as a module), Linux -can use a remote server as one of its block devices. So every time -the client computer wants to read, e.g., /dev/nb0, it sends a -request over TCP to the server, which will reply with the data read. -This can be used for stations with low disk space (or even diskless) -to borrow disk space from another computer. -Unlike NFS, it is possible to put any filesystem on it, etc. - -For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. - -The nbd kernel module need only be installed on the client -system, as the nbd-server is completely in userspace. In fact, -the nbd-server has been successfully ported to other operating -systems, including Windows. - -A) NBD parameters ------------------ - -max_part - Number of partitions per device (default: 0). - -nbds_max - Number of block devices that should be initialized (default: 16). diff --git a/Documentation/blockdev/paride.rst b/Documentation/blockdev/paride.rst deleted file mode 100644 index 87b4278bf314..000000000000 --- a/Documentation/blockdev/paride.rst +++ /dev/null @@ -1,439 +0,0 @@ -=================================== -Linux and parallel port IDE devices -=================================== - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction -=============== - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - - MicroSolutions backpack CD-ROM - - MicroSolutions backpack PD/CD - - MicroSolutions backpack hard-drives - - MicroSolutions backpack 8000t tape drive - - SyQuest EZ-135, EZ-230 & SparQ drives - - Avatar Shark - - Imation Superdisk LS-120 - - Maxell Superdisk LS-120 - - FreeCom Power CD - - Hewlett-Packard 5GB and 8GB tape drives - - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - === ============= - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - === ============= - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - ==== ====================================== ==== - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - ==== ====================================== ==== - - -2. Using the PARIDE subsystem -============================= - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - ================ ============ ====== ======== - Manufacturer Model Driver Protocol - ================ ============ ====== ======== - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - ================ ============ ====== ======== - -2.1 Configuring built-in drivers ---------------------------------- - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like:: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command:: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules ----------------------------------------------- - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: - using these drivers with the "kerneld" automatic module loading - system is not recommended for beginners, and is not documented here. - -Note 2: - if you build PARPORT support as a loadable module, PARIDE must - also be built as loadable modules, and PARPORT must be loaded before - the PARIDE modules. - -To use PARIDE, you must begin by:: - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example:: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command:: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378:: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device --------------------------- - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute:: - - #!/bin/bash - # - # mkd -- a script to create the device special files for the PARIDE subsystem - # - function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 - } - # - function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done - } - # - cd /dev - # - for u in 0 1 2 3 ; do pd $u ; done - for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done - for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done - for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done - for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done - for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done - # - # end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like:: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system:: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver ------------------- - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver ------------------------- - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver ------------------------- - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting -================== - -3.1 Use EPP mode if you can ----------------------------- - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay -------------------------- - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset -------------------------------------- - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing:: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help ------------------------------------------------------- - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in:: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help ---------------------------------- - -You can join the linux-parport mailing list by sending a mail message -to: - - linux-parport-request@torque.net - -with the single word:: - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/blockdev/ramdisk.rst b/Documentation/blockdev/ramdisk.rst deleted file mode 100644 index b7c2268f8dec..000000000000 --- a/Documentation/blockdev/ramdisk.rst +++ /dev/null @@ -1,177 +0,0 @@ -========================================== -Using the RAM disk block device with Linux -========================================== - -.. Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Parameters ---------------------------------- - -2a) Kernel Command Line Parameters - - ramdisk_size=N - Size of the ramdisk. - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB). - -2b) Module parameters - - rd_nr - /dev/ramX devices created. - - max_part - Maximum partition number. - - rd_size - See ramdisk_size. - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below:: - - ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF - ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 - ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do:: - - /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use:: - - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" - -Since the default start = 0 and the default prompt = 1, you could use:: - - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ------------------------------------------------ - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create:: - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example:: - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing:: - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy:: - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB):: - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552:: - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - - - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : - Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : - Original Document diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst deleted file mode 100644 index 6eccf13219ff..000000000000 --- a/Documentation/blockdev/zram.rst +++ /dev/null @@ -1,422 +0,0 @@ -======================================== -zram: Compressed RAM based block devices -======================================== - -Introduction -============ - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -Usage -===== - -There are several ways to configure and manage zram device(-s): - -a) using zram and zram_control sysfs attributes -b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). - -In this document we will describe only 'manual' zram configuration steps, -IOW, zram and zram_control sysfs attributes. - -In order to get a better idea about zramctl please consult util-linux -documentation, zramctl man-page or `zramctl --help`. Please be informed -that zram maintainers do not develop/maintain util-linux or zramctl, should -you have any questions please contact util-linux@vger.kernel.org - -Following shows a typical sequence of steps for using zram. - -WARNING -======= - -For the sake of simplicity we skip error checking parts in most of the -examples below. However, it is your sole responsibility to handle errors. - -zram sysfs attributes always return negative values in case of errors. -The list of possible return codes: - -======== ============================================================= --EBUSY an attempt to modify an attribute that cannot be changed once - the device has been initialised. Please reset device first; --ENOMEM zram was not able to allocate enough memory to fulfil your - needs; --EINVAL invalid input has been provided. -======== ============================================================= - -If you use 'echo', the returned value that is changed by 'echo' utility, -and, in general case, something like:: - - echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; - handle_error - fi - -should suffice. - -1) Load Module -============== - -:: - - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - -num_devices parameter is optional and tells zram how many devices should be -pre-created. Default: 1. - -2) Set max number of compression streams -======================================== - -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus -allowing several concurrent compression operations. The number of -allocated compression streams goes down when some of the CPUs -become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. - -To find out how many streams are currently available:: - - cat /sys/block/zram0/max_comp_streams - -3) Select compression algorithm -=============================== - -Using comp_algorithm device attribute one can see available and -currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised -there is no way to change compression algorithm). - -Examples:: - - #show supported compression algorithms - cat /sys/block/zram0/comp_algorithm - lzo [lz4] - - #select lzo compression algorithm - echo lzo > /sys/block/zram0/comp_algorithm - -For the time being, the `comp_algorithm` content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). - -4) Set Disksize -=============== - -Set disk size by writing the value to sysfs node 'disksize'. -The value can be either in bytes or you can use mem suffixes. -Examples:: - - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -Note: -There is little point creating a zram of greater than twice the size of memory -since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the -size of the disk when not in use so a huge zram is wasteful. - -5) Set memory limit: Optional -============================= - -Set memory limit by writing the value to sysfs node 'mem_limit'. -The value can be either in bytes or you can use mem suffixes. -In addition, you could change the value in runtime. -Examples:: - - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit - -6) Activate -=========== - -:: - - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -7) Add/remove zram devices -========================== - -zram provides a control interface, which enables dynamic (on-demand) device -addition and removal. - -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. - -Example:: - - cat /sys/class/zram-control/hot_add - 1 - -To remove the existing /dev/zramX device (where X is a device id) -execute:: - - echo X > /sys/class/zram-control/hot_remove - -8) Stats -======== - -Per-device statistics are exported as various nodes under /sys/block/zram/ - -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. - -====================== ====== =============================================== -Name access description -====================== ====== =============================================== -disksize RW show and set the device's disk size -initstate RO shows the initialization state of the device -reset WO trigger device reset -mem_used_max WO reset the `mem_used_max` counter (see later) -mem_limit WO specifies the maximum amount of memory ZRAM can - use to store the compressed data -writeback_limit WO specifies the maximum amount of write IO zram - can write out to backing device as 4KB unit -writeback_limit_enable RW show and set writeback_limit feature -max_comp_streams RW the number of possible concurrent compress - operations -comp_algorithm RW show and change the compression algorithm -compact WO trigger memory compaction -debug_stat RO this file is used for zram debugging purposes -backing_dev RW set up backend storage for zram to write out -idle WO mark allocated slot as idle -====================== ====== =============================================== - - -User space is advised to use the following files to read the device statistics. - -File /sys/block/zram/stat - -Represents block layer statistics. Read Documentation/block/stat.rst for -details. - -File /sys/block/zram/io_stat - -The stat file represents device's I/O statistics not accounted by block -layer and, thus, not available in zram/stat file. It consists of a -single line of text and contains the following stats separated by -whitespace: - - ============= ============================================================= - failed_reads The number of failed reads - failed_writes The number of failed writes - invalid_io The number of non-page-size-aligned I/O requests - notify_free Depending on device usage scenario it may account - - a) the number of pages freed because of swap slot free - notifications - b) the number of pages freed because of - REQ_OP_DISCARD requests sent by bio. The former ones are - sent to a swap block device when a swap slot is freed, - which implies that this disk is being used as a swap disk. - - The latter ones are sent by filesystem mounted with - discard option, whenever some data blocks are getting - discarded. - ============= ============================================================= - -File /sys/block/zram/mm_stat - -The stat file represents device's mm statistics. It consists of a single -line of text and contains the following stats separated by whitespace: - - ================ ============================================================= - orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. - Unit: bytes - compr_data_size compressed size of data stored in this disk - mem_used_total the amount of memory allocated for this disk. This - includes allocator fragmentation and metadata overhead, - allocated for this disk. So, allocator space efficiency - can be calculated using compr_data_size and this statistic. - Unit: bytes - mem_limit the maximum amount of memory ZRAM can use to store - the compressed data - mem_used_max the maximum amount of memory zram have consumed to - store the data - same_pages the number of same element filled pages written to this disk. - No memory is allocated for such pages. - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - ================ ============================================================= - -File /sys/block/zram/bd_stat - -The stat file represents device's backing device statistics. It consists of -a single line of text and contains the following stats separated by whitespace: - - ============== ============================================================= - bd_count size of data written in backing device. - Unit: 4K bytes - bd_reads the number of reads from backing device - Unit: 4K bytes - bd_writes the number of writes to backing device - Unit: 4K bytes - ============== ============================================================= - -9) Deactivate -============= - -:: - - swapoff /dev/zram0 - umount /dev/zram1 - -10) Reset -========= - - Write any positive value to 'reset' sysfs node:: - - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Optional Feature -================ - -writeback ---------- - -With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page -to backing storage rather than keeping it in memory. -To use the feature, admin should set up backing device via:: - - echo /dev/sda5 > /sys/block/zramX/backing_dev - -before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via:: - - echo huge > /sys/block/zramX/write - -To use idle page writeback, first, user need to declare zram pages -as idle:: - - echo all > /sys/block/zramX/idle - -From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. -IOW, unless there is access request, those pages are still idle pages. - -Admin can request writeback of those idle pages at right timing via:: - - echo idle > /sys/block/zramX/writeback - -With the command, zram writeback idle pages from memory to the storage. - -If there are lots of write IO with flash device, potentially, it has -flash wearout problem so that admin needs to design write limitation -to guarantee storage health for entire product life. - -To overcome the concern, zram supports "writeback_limit" feature. -The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should -enable writeback_limit_enable via:: - - $ echo 1 > /sys/block/zramX/writeback_limit_enable - -Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. - -(If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) - -If admin want to limit writeback as per-day 400M, he could do it -like below:: - - $ MB_SHIFT=20 - $ 4K_SHIFT=12 - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit. - $ echo 1 > /sys/block/zram0/writeback_limit_enable - -If admin want to allow further write again once the bugdet is exausted, -he could do it like below:: - - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit - -If admin want to see remaining writeback budget since he set:: - - $ cat /sys/block/zramX/writeback_limit - -If admin want to disable writeback limit, he could do:: - - $ echo 0 > /sys/block/zramX/writeback_limit_enable - -The writeback_limit count will reset whenever you reset zram(e.g., -system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of -writeback happened until you reset the zram to allocate extra writeback -budget in next setting is user's job. - -If admin want to measure writeback count in a certain period, he could -know it via /sys/block/zram0/bd_stat's 3rd column. - -memory tracking -=============== - -With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the -zram block. It could be useful to catch cold or incompressible -pages of the process with*pagemap. - -If you enable the feature, you could see block state via -/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: - - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi - -First column - zram's block index. -Second column - access time since the system was booted -Third column - state of the block: - - s: - same page - w: - written page to backing store - h: - huge page - i: - idle page - -First line of above example says 300th block is accessed at 75.033841sec -and the block's state is huge so it is written back to the backing -storage. It's a debugging feature so anyone shouldn't rely on it to work -properly. - -Nitin Gupta -ngupta@vflare.org -- cgit From 4d3beaa06d3536aa8968d1828a66bd5ccb5036ac Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 19 Apr 2019 21:39:29 -0300 Subject: docs: security: move some books to it and update The following files belong to security: Documentation/security/LSM.rst -> Documentation/security/lsm-development.rst Documentation/lsm.txt -> Documentation/security/lsm.rst Documentation/SAK.txt -> Documentation/security/sak.rst Documentation/siphash.txt -> Documentation/security/siphash.rst Signed-off-by: Mauro Carvalho Chehab --- Documentation/SAK.txt | 91 ------------- Documentation/lsm.txt | 201 ---------------------------- Documentation/security/LSM.rst | 17 --- Documentation/security/index.rst | 5 +- Documentation/security/lsm-development.rst | 17 +++ Documentation/security/lsm.rst | 201 ++++++++++++++++++++++++++++ Documentation/security/sak.rst | 91 +++++++++++++ Documentation/security/siphash.rst | 189 ++++++++++++++++++++++++++ Documentation/security/tpm/index.rst | 1 + Documentation/security/tpm/xen-tpmfront.rst | 2 - Documentation/siphash.txt | 189 -------------------------- 11 files changed, 503 insertions(+), 501 deletions(-) delete mode 100644 Documentation/SAK.txt delete mode 100644 Documentation/lsm.txt delete mode 100644 Documentation/security/LSM.rst create mode 100644 Documentation/security/lsm-development.rst create mode 100644 Documentation/security/lsm.rst create mode 100644 Documentation/security/sak.rst create mode 100644 Documentation/security/siphash.rst delete mode 100644 Documentation/siphash.txt (limited to 'Documentation') diff --git a/Documentation/SAK.txt b/Documentation/SAK.txt deleted file mode 100644 index 260e1d3687bd..000000000000 --- a/Documentation/SAK.txt +++ /dev/null @@ -1,91 +0,0 @@ -========================================= -Linux Secure Attention Key (SAK) handling -========================================= - -:Date: 18 March 2001 -:Author: Andrew Morton - -An operating system's Secure Attention Key is a security tool which is -provided as protection against trojan password capturing programs. It -is an undefeatable way of killing all programs which could be -masquerading as login applications. Users need to be taught to enter -this key sequence before they log in to the system. - -From the PC keyboard, Linux has two similar but different ways of -providing SAK. One is the ALT-SYSRQ-K sequence. You shouldn't use -this sequence. It is only available if the kernel was compiled with -sysrq support. - -The proper way of generating a SAK is to define the key sequence using -``loadkeys``. This will work whether or not sysrq support is compiled -into the kernel. - -SAK works correctly when the keyboard is in raw mode. This means that -once defined, SAK will kill a running X server. If the system is in -run level 5, the X server will restart. This is what you want to -happen. - -What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot -the machine. CTRL-ALT-BACKSPACE is magical to the X server. We'll -choose CTRL-ALT-PAUSE. - -In your rc.sysinit (or rc.local) file, add the command:: - - echo "control alt keycode 101 = SAK" | /bin/loadkeys - -And that's it! Only the superuser may reprogram the SAK key. - - -.. note:: - - 1. Linux SAK is said to be not a "true SAK" as is required by - systems which implement C2 level security. This author does not - know why. - - - 2. On the PC keyboard, SAK kills all applications which have - /dev/console opened. - - Unfortunately this includes a number of things which you don't - actually want killed. This is because these applications are - incorrectly holding /dev/console open. Be sure to complain to your - Linux distributor about this! - - You can identify processes which will be killed by SAK with the - command:: - - # ls -l /proc/[0-9]*/fd/* | grep console - l-wx------ 1 root root 64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console - - Then:: - - # ps aux|grep 579 - root 579 0.0 0.1 1088 436 ? S 00:43 0:00 gpm -t ps/2 - - So ``gpm`` will be killed by SAK. This is a bug in gpm. It should - be closing standard input. You can work around this by finding the - initscript which launches gpm and changing it thusly: - - Old:: - - daemon gpm - - New:: - - daemon gpm < /dev/null - - Vixie cron also seems to have this problem, and needs the same treatment. - - Also, one prominent Linux distribution has the following three - lines in its rc.sysinit and rc scripts:: - - exec 3<&0 - exec 4>&1 - exec 5>&2 - - These commands cause **all** daemons which are launched by the - initscripts to have file descriptors 3, 4 and 5 attached to - /dev/console. So SAK kills them all. A workaround is to simply - delete these lines, but this may cause system management - applications to malfunction - test everything well. - diff --git a/Documentation/lsm.txt b/Documentation/lsm.txt deleted file mode 100644 index ad4dfd020e0d..000000000000 --- a/Documentation/lsm.txt +++ /dev/null @@ -1,201 +0,0 @@ -======================================================== -Linux Security Modules: General Security Hooks for Linux -======================================================== - -:Author: Stephen Smalley -:Author: Timothy Fraser -:Author: Chris Vance - -.. note:: - - The APIs described in this book are outdated. - -Introduction -============ - -In March 2001, the National Security Agency (NSA) gave a presentation -about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel Summit. -SELinux is an implementation of flexible and fine-grained -nondiscretionary access controls in the Linux kernel, originally -implemented as its own particular kernel patch. Several other security -projects (e.g. RSBAC, Medusa) have also developed flexible access -control architectures for the Linux kernel, and various projects have -developed particular access control models for Linux (e.g. LIDS, DTE, -SubDomain). Each project has developed and maintained its own kernel -patch to support its security needs. - -In response to the NSA presentation, Linus Torvalds made a set of -remarks that described a security framework he would be willing to -consider for inclusion in the mainstream Linux kernel. He described a -general framework that would provide a set of security hooks to control -operations on kernel objects and a set of opaque security fields in -kernel data structures for maintaining security attributes. This -framework could then be used by loadable kernel modules to implement any -desired model of security. Linus also suggested the possibility of -migrating the Linux capabilities code into such a module. - -The Linux Security Modules (LSM) project was started by WireX to develop -such a framework. LSM is a joint development effort by several security -projects, including Immunix, SELinux, SGI and Janus, and several -individuals, including Greg Kroah-Hartman and James Morris, to develop a -Linux kernel patch that implements this framework. The patch is -currently tracking the 2.4 series and is targeted for integration into -the 2.5 development series. This technical report provides an overview -of the framework and the example capabilities security module provided -by the LSM kernel patch. - -LSM Framework -============= - -The LSM kernel patch provides a general kernel framework to support -security modules. In particular, the LSM framework is primarily focused -on supporting access control modules, although future development is -likely to address other security needs such as auditing. By itself, the -framework does not provide any additional security; it merely provides -the infrastructure to support security modules. The LSM kernel patch -also moves most of the capabilities logic into an optional security -module, with the system defaulting to the traditional superuser logic. -This capabilities module is discussed further in -`LSM Capabilities Module <#cap>`__. - -The LSM kernel patch adds security fields to kernel data structures and -inserts calls to hook functions at critical points in the kernel code to -manage the security fields and to perform access control. It also adds -functions for registering and unregistering security modules, and adds a -general :c:func:`security()` system call to support new system calls -for security-aware applications. - -The LSM security fields are simply ``void*`` pointers. For process and -program execution security information, security fields were added to -:c:type:`struct task_struct ` and -:c:type:`struct linux_binprm `. For filesystem -security information, a security field was added to :c:type:`struct -super_block `. For pipe, file, and socket security -information, security fields were added to :c:type:`struct inode -` and :c:type:`struct file `. For packet and -network device security information, security fields were added to -:c:type:`struct sk_buff ` and :c:type:`struct -net_device `. For System V IPC security information, -security fields were added to :c:type:`struct kern_ipc_perm -` and :c:type:`struct msg_msg -`; additionally, the definitions for :c:type:`struct -msg_msg `, struct msg_queue, and struct shmid_kernel -were moved to header files (``include/linux/msg.h`` and -``include/linux/shm.h`` as appropriate) to allow the security modules to -use these definitions. - -Each LSM hook is a function pointer in a global table, security_ops. -This table is a :c:type:`struct security_operations -` structure as defined by -``include/linux/security.h``. Detailed documentation for each hook is -included in this header file. At present, this structure consists of a -collection of substructures that group related hooks based on the kernel -object (e.g. task, inode, file, sk_buff, etc) as well as some top-level -hook function pointers for system operations. This structure is likely -to be flattened in the future for performance. The placement of the hook -calls in the kernel code is described by the "called:" lines in the -per-hook documentation in the header file. The hook calls can also be -easily found in the kernel code by looking for the string -"security_ops->". - -Linus mentioned per-process security hooks in his original remarks as a -possible alternative to global security hooks. However, if LSM were to -start from the perspective of per-process hooks, then the base framework -would have to deal with how to handle operations that involve multiple -processes (e.g. kill), since each process might have its own hook for -controlling the operation. This would require a general mechanism for -composing hooks in the base framework. Additionally, LSM would still -need global hooks for operations that have no process context (e.g. -network input operations). Consequently, LSM provides global security -hooks, but a security module is free to implement per-process hooks -(where that makes sense) by storing a security_ops table in each -process' security field and then invoking these per-process hooks from -the global hooks. The problem of composition is thus deferred to the -module. - -The global security_ops table is initialized to a set of hook functions -provided by a dummy security module that provides traditional superuser -logic. A :c:func:`register_security()` function (in -``security/security.c``) is provided to allow a security module to set -security_ops to refer to its own hook functions, and an -:c:func:`unregister_security()` function is provided to revert -security_ops to the dummy module hooks. This mechanism is used to set -the primary security module, which is responsible for making the final -decision for each hook. - -LSM also provides a simple mechanism for stacking additional security -modules with the primary security module. It defines -:c:func:`register_security()` and -:c:func:`unregister_security()` hooks in the :c:type:`struct -security_operations ` structure and -provides :c:func:`mod_reg_security()` and -:c:func:`mod_unreg_security()` functions that invoke these hooks -after performing some sanity checking. A security module can call these -functions in order to stack with other modules. However, the actual -details of how this stacking is handled are deferred to the module, -which can implement these hooks in any way it wishes (including always -returning an error if it does not wish to support stacking). In this -manner, LSM again defers the problem of composition to the module. - -Although the LSM hooks are organized into substructures based on kernel -object, all of the hooks can be viewed as falling into two major -categories: hooks that are used to manage the security fields and hooks -that are used to perform access control. Examples of the first category -of hooks include the :c:func:`alloc_security()` and -:c:func:`free_security()` hooks defined for each kernel data -structure that has a security field. These hooks are used to allocate -and free security structures for kernel objects. The first category of -hooks also includes hooks that set information in the security field -after allocation, such as the :c:func:`post_lookup()` hook in -:c:type:`struct inode_security_ops `. -This hook is used to set security information for inodes after -successful lookup operations. An example of the second category of hooks -is the :c:func:`permission()` hook in :c:type:`struct -inode_security_ops `. This hook checks -permission when accessing an inode. - -LSM Capabilities Module -======================= - -The LSM kernel patch moves most of the existing POSIX.1e capabilities -logic into an optional security module stored in the file -``security/capability.c``. This change allows users who do not want to -use capabilities to omit this code entirely from their kernel, instead -using the dummy module for traditional superuser logic or any other -module that they desire. This change also allows the developers of the -capabilities logic to maintain and enhance their code more freely, -without needing to integrate patches back into the base kernel. - -In addition to moving the capabilities logic, the LSM kernel patch could -move the capability-related fields from the kernel data structures into -the new security fields managed by the security modules. However, at -present, the LSM kernel patch leaves the capability fields in the kernel -data structures. In his original remarks, Linus suggested that this -might be preferable so that other security modules can be easily stacked -with the capabilities module without needing to chain multiple security -structures on the security field. It also avoids imposing extra overhead -on the capabilities module to manage the security fields. However, the -LSM framework could certainly support such a move if it is determined to -be desirable, with only a few additional changes described below. - -At present, the capabilities logic for computing process capabilities on -:c:func:`execve()` and :c:func:`set\*uid()`, checking -capabilities for a particular process, saving and checking capabilities -for netlink messages, and handling the :c:func:`capget()` and -:c:func:`capset()` system calls have been moved into the -capabilities module. There are still a few locations in the base kernel -where capability-related fields are directly examined or modified, but -the current version of the LSM patch does allow a security module to -completely replace the assignment and testing of capabilities. These few -locations would need to be changed if the capability-related fields were -moved into the security field. The following is a list of known -locations that still perform such direct examination or modification of -capability-related fields: - -- ``fs/open.c``::c:func:`sys_access()` - -- ``fs/lockd/host.c``::c:func:`nlm_bind_host()` - -- ``fs/nfsd/auth.c``::c:func:`nfsd_setuser()` - -- ``fs/proc/array.c``::c:func:`task_cap()` diff --git a/Documentation/security/LSM.rst b/Documentation/security/LSM.rst deleted file mode 100644 index 31d92bc5fdd2..000000000000 --- a/Documentation/security/LSM.rst +++ /dev/null @@ -1,17 +0,0 @@ -================================= -Linux Security Module Development -================================= - -Based on https://lkml.org/lkml/2007/10/26/215, -a new LSM is accepted into the kernel when its intent (a description of -what it tries to protect against and in what cases one would expect to -use it) has been appropriately documented in ``Documentation/admin-guide/LSM/``. -This allows an LSM's code to be easily compared to its goals, and so -that end users and distros can make a more informed decision about which -LSMs suit their requirements. - -For extensive documentation on the available LSM hook interfaces, please -see ``include/linux/lsm_hooks.h`` and associated structures: - -.. kernel-doc:: include/linux/lsm_hooks.h - :internal: diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index aad6d92ffe31..fc503dd689a7 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -8,7 +8,10 @@ Security Documentation credentials IMA-templates keys/index - LSM + lsm + lsm-development + sak SCTP self-protection + siphash tpm/index diff --git a/Documentation/security/lsm-development.rst b/Documentation/security/lsm-development.rst new file mode 100644 index 000000000000..31d92bc5fdd2 --- /dev/null +++ b/Documentation/security/lsm-development.rst @@ -0,0 +1,17 @@ +================================= +Linux Security Module Development +================================= + +Based on https://lkml.org/lkml/2007/10/26/215, +a new LSM is accepted into the kernel when its intent (a description of +what it tries to protect against and in what cases one would expect to +use it) has been appropriately documented in ``Documentation/admin-guide/LSM/``. +This allows an LSM's code to be easily compared to its goals, and so +that end users and distros can make a more informed decision about which +LSMs suit their requirements. + +For extensive documentation on the available LSM hook interfaces, please +see ``include/linux/lsm_hooks.h`` and associated structures: + +.. kernel-doc:: include/linux/lsm_hooks.h + :internal: diff --git a/Documentation/security/lsm.rst b/Documentation/security/lsm.rst new file mode 100644 index 000000000000..ad4dfd020e0d --- /dev/null +++ b/Documentation/security/lsm.rst @@ -0,0 +1,201 @@ +======================================================== +Linux Security Modules: General Security Hooks for Linux +======================================================== + +:Author: Stephen Smalley +:Author: Timothy Fraser +:Author: Chris Vance + +.. note:: + + The APIs described in this book are outdated. + +Introduction +============ + +In March 2001, the National Security Agency (NSA) gave a presentation +about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel Summit. +SELinux is an implementation of flexible and fine-grained +nondiscretionary access controls in the Linux kernel, originally +implemented as its own particular kernel patch. Several other security +projects (e.g. RSBAC, Medusa) have also developed flexible access +control architectures for the Linux kernel, and various projects have +developed particular access control models for Linux (e.g. LIDS, DTE, +SubDomain). Each project has developed and maintained its own kernel +patch to support its security needs. + +In response to the NSA presentation, Linus Torvalds made a set of +remarks that described a security framework he would be willing to +consider for inclusion in the mainstream Linux kernel. He described a +general framework that would provide a set of security hooks to control +operations on kernel objects and a set of opaque security fields in +kernel data structures for maintaining security attributes. This +framework could then be used by loadable kernel modules to implement any +desired model of security. Linus also suggested the possibility of +migrating the Linux capabilities code into such a module. + +The Linux Security Modules (LSM) project was started by WireX to develop +such a framework. LSM is a joint development effort by several security +projects, including Immunix, SELinux, SGI and Janus, and several +individuals, including Greg Kroah-Hartman and James Morris, to develop a +Linux kernel patch that implements this framework. The patch is +currently tracking the 2.4 series and is targeted for integration into +the 2.5 development series. This technical report provides an overview +of the framework and the example capabilities security module provided +by the LSM kernel patch. + +LSM Framework +============= + +The LSM kernel patch provides a general kernel framework to support +security modules. In particular, the LSM framework is primarily focused +on supporting access control modules, although future development is +likely to address other security needs such as auditing. By itself, the +framework does not provide any additional security; it merely provides +the infrastructure to support security modules. The LSM kernel patch +also moves most of the capabilities logic into an optional security +module, with the system defaulting to the traditional superuser logic. +This capabilities module is discussed further in +`LSM Capabilities Module <#cap>`__. + +The LSM kernel patch adds security fields to kernel data structures and +inserts calls to hook functions at critical points in the kernel code to +manage the security fields and to perform access control. It also adds +functions for registering and unregistering security modules, and adds a +general :c:func:`security()` system call to support new system calls +for security-aware applications. + +The LSM security fields are simply ``void*`` pointers. For process and +program execution security information, security fields were added to +:c:type:`struct task_struct ` and +:c:type:`struct linux_binprm `. For filesystem +security information, a security field was added to :c:type:`struct +super_block `. For pipe, file, and socket security +information, security fields were added to :c:type:`struct inode +` and :c:type:`struct file `. For packet and +network device security information, security fields were added to +:c:type:`struct sk_buff ` and :c:type:`struct +net_device `. For System V IPC security information, +security fields were added to :c:type:`struct kern_ipc_perm +` and :c:type:`struct msg_msg +`; additionally, the definitions for :c:type:`struct +msg_msg `, struct msg_queue, and struct shmid_kernel +were moved to header files (``include/linux/msg.h`` and +``include/linux/shm.h`` as appropriate) to allow the security modules to +use these definitions. + +Each LSM hook is a function pointer in a global table, security_ops. +This table is a :c:type:`struct security_operations +` structure as defined by +``include/linux/security.h``. Detailed documentation for each hook is +included in this header file. At present, this structure consists of a +collection of substructures that group related hooks based on the kernel +object (e.g. task, inode, file, sk_buff, etc) as well as some top-level +hook function pointers for system operations. This structure is likely +to be flattened in the future for performance. The placement of the hook +calls in the kernel code is described by the "called:" lines in the +per-hook documentation in the header file. The hook calls can also be +easily found in the kernel code by looking for the string +"security_ops->". + +Linus mentioned per-process security hooks in his original remarks as a +possible alternative to global security hooks. However, if LSM were to +start from the perspective of per-process hooks, then the base framework +would have to deal with how to handle operations that involve multiple +processes (e.g. kill), since each process might have its own hook for +controlling the operation. This would require a general mechanism for +composing hooks in the base framework. Additionally, LSM would still +need global hooks for operations that have no process context (e.g. +network input operations). Consequently, LSM provides global security +hooks, but a security module is free to implement per-process hooks +(where that makes sense) by storing a security_ops table in each +process' security field and then invoking these per-process hooks from +the global hooks. The problem of composition is thus deferred to the +module. + +The global security_ops table is initialized to a set of hook functions +provided by a dummy security module that provides traditional superuser +logic. A :c:func:`register_security()` function (in +``security/security.c``) is provided to allow a security module to set +security_ops to refer to its own hook functions, and an +:c:func:`unregister_security()` function is provided to revert +security_ops to the dummy module hooks. This mechanism is used to set +the primary security module, which is responsible for making the final +decision for each hook. + +LSM also provides a simple mechanism for stacking additional security +modules with the primary security module. It defines +:c:func:`register_security()` and +:c:func:`unregister_security()` hooks in the :c:type:`struct +security_operations ` structure and +provides :c:func:`mod_reg_security()` and +:c:func:`mod_unreg_security()` functions that invoke these hooks +after performing some sanity checking. A security module can call these +functions in order to stack with other modules. However, the actual +details of how this stacking is handled are deferred to the module, +which can implement these hooks in any way it wishes (including always +returning an error if it does not wish to support stacking). In this +manner, LSM again defers the problem of composition to the module. + +Although the LSM hooks are organized into substructures based on kernel +object, all of the hooks can be viewed as falling into two major +categories: hooks that are used to manage the security fields and hooks +that are used to perform access control. Examples of the first category +of hooks include the :c:func:`alloc_security()` and +:c:func:`free_security()` hooks defined for each kernel data +structure that has a security field. These hooks are used to allocate +and free security structures for kernel objects. The first category of +hooks also includes hooks that set information in the security field +after allocation, such as the :c:func:`post_lookup()` hook in +:c:type:`struct inode_security_ops `. +This hook is used to set security information for inodes after +successful lookup operations. An example of the second category of hooks +is the :c:func:`permission()` hook in :c:type:`struct +inode_security_ops `. This hook checks +permission when accessing an inode. + +LSM Capabilities Module +======================= + +The LSM kernel patch moves most of the existing POSIX.1e capabilities +logic into an optional security module stored in the file +``security/capability.c``. This change allows users who do not want to +use capabilities to omit this code entirely from their kernel, instead +using the dummy module for traditional superuser logic or any other +module that they desire. This change also allows the developers of the +capabilities logic to maintain and enhance their code more freely, +without needing to integrate patches back into the base kernel. + +In addition to moving the capabilities logic, the LSM kernel patch could +move the capability-related fields from the kernel data structures into +the new security fields managed by the security modules. However, at +present, the LSM kernel patch leaves the capability fields in the kernel +data structures. In his original remarks, Linus suggested that this +might be preferable so that other security modules can be easily stacked +with the capabilities module without needing to chain multiple security +structures on the security field. It also avoids imposing extra overhead +on the capabilities module to manage the security fields. However, the +LSM framework could certainly support such a move if it is determined to +be desirable, with only a few additional changes described below. + +At present, the capabilities logic for computing process capabilities on +:c:func:`execve()` and :c:func:`set\*uid()`, checking +capabilities for a particular process, saving and checking capabilities +for netlink messages, and handling the :c:func:`capget()` and +:c:func:`capset()` system calls have been moved into the +capabilities module. There are still a few locations in the base kernel +where capability-related fields are directly examined or modified, but +the current version of the LSM patch does allow a security module to +completely replace the assignment and testing of capabilities. These few +locations would need to be changed if the capability-related fields were +moved into the security field. The following is a list of known +locations that still perform such direct examination or modification of +capability-related fields: + +- ``fs/open.c``::c:func:`sys_access()` + +- ``fs/lockd/host.c``::c:func:`nlm_bind_host()` + +- ``fs/nfsd/auth.c``::c:func:`nfsd_setuser()` + +- ``fs/proc/array.c``::c:func:`task_cap()` diff --git a/Documentation/security/sak.rst b/Documentation/security/sak.rst new file mode 100644 index 000000000000..260e1d3687bd --- /dev/null +++ b/Documentation/security/sak.rst @@ -0,0 +1,91 @@ +========================================= +Linux Secure Attention Key (SAK) handling +========================================= + +:Date: 18 March 2001 +:Author: Andrew Morton + +An operating system's Secure Attention Key is a security tool which is +provided as protection against trojan password capturing programs. It +is an undefeatable way of killing all programs which could be +masquerading as login applications. Users need to be taught to enter +this key sequence before they log in to the system. + +From the PC keyboard, Linux has two similar but different ways of +providing SAK. One is the ALT-SYSRQ-K sequence. You shouldn't use +this sequence. It is only available if the kernel was compiled with +sysrq support. + +The proper way of generating a SAK is to define the key sequence using +``loadkeys``. This will work whether or not sysrq support is compiled +into the kernel. + +SAK works correctly when the keyboard is in raw mode. This means that +once defined, SAK will kill a running X server. If the system is in +run level 5, the X server will restart. This is what you want to +happen. + +What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot +the machine. CTRL-ALT-BACKSPACE is magical to the X server. We'll +choose CTRL-ALT-PAUSE. + +In your rc.sysinit (or rc.local) file, add the command:: + + echo "control alt keycode 101 = SAK" | /bin/loadkeys + +And that's it! Only the superuser may reprogram the SAK key. + + +.. note:: + + 1. Linux SAK is said to be not a "true SAK" as is required by + systems which implement C2 level security. This author does not + know why. + + + 2. On the PC keyboard, SAK kills all applications which have + /dev/console opened. + + Unfortunately this includes a number of things which you don't + actually want killed. This is because these applications are + incorrectly holding /dev/console open. Be sure to complain to your + Linux distributor about this! + + You can identify processes which will be killed by SAK with the + command:: + + # ls -l /proc/[0-9]*/fd/* | grep console + l-wx------ 1 root root 64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console + + Then:: + + # ps aux|grep 579 + root 579 0.0 0.1 1088 436 ? S 00:43 0:00 gpm -t ps/2 + + So ``gpm`` will be killed by SAK. This is a bug in gpm. It should + be closing standard input. You can work around this by finding the + initscript which launches gpm and changing it thusly: + + Old:: + + daemon gpm + + New:: + + daemon gpm < /dev/null + + Vixie cron also seems to have this problem, and needs the same treatment. + + Also, one prominent Linux distribution has the following three + lines in its rc.sysinit and rc scripts:: + + exec 3<&0 + exec 4>&1 + exec 5>&2 + + These commands cause **all** daemons which are launched by the + initscripts to have file descriptors 3, 4 and 5 attached to + /dev/console. So SAK kills them all. A workaround is to simply + delete these lines, but this may cause system management + applications to malfunction - test everything well. + diff --git a/Documentation/security/siphash.rst b/Documentation/security/siphash.rst new file mode 100644 index 000000000000..9965821ab333 --- /dev/null +++ b/Documentation/security/siphash.rst @@ -0,0 +1,189 @@ +=========================== +SipHash - a short input PRF +=========================== + +:Author: Written by Jason A. Donenfeld + +SipHash is a cryptographically secure PRF -- a keyed hash function -- that +performs very well for short inputs, hence the name. It was designed by +cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended +as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`, +and so forth. + +SipHash takes a secret key filled with randomly generated numbers and either +an input buffer or several input integers. It spits out an integer that is +indistinguishable from random. You may then use that integer as part of secure +sequence numbers, secure cookies, or mask it off for use in a hash table. + +Generating a key +================ + +Keys should always be generated from a cryptographically secure source of +random numbers, either using get_random_bytes or get_random_once:: + + siphash_key_t key; + get_random_bytes(&key, sizeof(key)); + +If you're not deriving your key from here, you're doing it wrong. + +Using the functions +=================== + +There are two variants of the function, one that takes a list of integers, and +one that takes a buffer:: + + u64 siphash(const void *data, size_t len, const siphash_key_t *key); + +And:: + + u64 siphash_1u64(u64, const siphash_key_t *key); + u64 siphash_2u64(u64, u64, const siphash_key_t *key); + u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key); + u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key); + u64 siphash_1u32(u32, const siphash_key_t *key); + u64 siphash_2u32(u32, u32, const siphash_key_t *key); + u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key); + u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key); + +If you pass the generic siphash function something of a constant length, it +will constant fold at compile-time and automatically choose one of the +optimized functions. + +Hashtable key function usage:: + + struct some_hashtable { + DECLARE_HASHTABLE(hashtable, 8); + siphash_key_t key; + }; + + void init_hashtable(struct some_hashtable *table) + { + get_random_bytes(&table->key, sizeof(table->key)); + } + + static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) + { + return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; + } + +You may then iterate like usual over the returned hash bucket. + +Security +======== + +SipHash has a very high security margin, with its 128-bit key. So long as the +key is kept secret, it is impossible for an attacker to guess the outputs of +the function, even if being able to observe many outputs, since 2^128 outputs +is significant. + +Linux implements the "2-4" variant of SipHash. + +Struct-passing Pitfalls +======================= + +Often times the XuY functions will not be large enough, and instead you'll +want to pass a pre-filled struct to siphash. When doing this, it's important +to always ensure the struct has no padding holes. The easiest way to do this +is to simply arrange the members of the struct in descending order of size, +and to use offsetendof() instead of sizeof() for getting the size. For +performance reasons, if possible, it's probably a good thing to align the +struct to the right boundary. Here's an example:: + + const struct { + struct in6_addr saddr; + u32 counter; + u16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .counter = counter, + .dport = dport + }; + u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret); + +Resources +========= + +Read the SipHash paper if you're interested in learning more: +https://131002.net/siphash/siphash.pdf + +------------------------------------------------------------------------------- + +=============================================== +HalfSipHash - SipHash's insecure younger cousin +=============================================== + +:Author: Written by Jason A. Donenfeld + +On the off-chance that SipHash is not fast enough for your needs, you might be +able to justify using HalfSipHash, a terrifying but potentially useful +possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and, +even scarier, uses an easily brute-forcable 64-bit key (with a 32-bit output) +instead of SipHash's 128-bit key. However, this may appeal to some +high-performance `jhash` users. + +Danger! + +Do not ever use HalfSipHash except for as a hashtable key function, and only +then when you can be absolutely certain that the outputs will never be +transmitted out of the kernel. This is only remotely useful over `jhash` as a +means of mitigating hashtable flooding denial of service attacks. + +Generating a key +================ + +Keys should always be generated from a cryptographically secure source of +random numbers, either using get_random_bytes or get_random_once: + +hsiphash_key_t key; +get_random_bytes(&key, sizeof(key)); + +If you're not deriving your key from here, you're doing it wrong. + +Using the functions +=================== + +There are two variants of the function, one that takes a list of integers, and +one that takes a buffer:: + + u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key); + +And:: + + u32 hsiphash_1u32(u32, const hsiphash_key_t *key); + u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key); + u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key); + u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key); + +If you pass the generic hsiphash function something of a constant length, it +will constant fold at compile-time and automatically choose one of the +optimized functions. + +Hashtable key function usage +============================ + +:: + + struct some_hashtable { + DECLARE_HASHTABLE(hashtable, 8); + hsiphash_key_t key; + }; + + void init_hashtable(struct some_hashtable *table) + { + get_random_bytes(&table->key, sizeof(table->key)); + } + + static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) + { + return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; + } + +You may then iterate like usual over the returned hash bucket. + +Performance +=========== + +HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements, +this will not be a problem, as the hashtable lookup isn't the bottleneck. And +in general, this is probably a good sacrifice to make for the security and DoS +resistance of HalfSipHash. diff --git a/Documentation/security/tpm/index.rst b/Documentation/security/tpm/index.rst index af77a7bbb070..3296533e54cf 100644 --- a/Documentation/security/tpm/index.rst +++ b/Documentation/security/tpm/index.rst @@ -5,3 +5,4 @@ Trusted Platform Module documentation .. toctree:: tpm_vtpm_proxy + xen-tpmfront diff --git a/Documentation/security/tpm/xen-tpmfront.rst b/Documentation/security/tpm/xen-tpmfront.rst index 98a16ab87360..00d5b1db227d 100644 --- a/Documentation/security/tpm/xen-tpmfront.rst +++ b/Documentation/security/tpm/xen-tpmfront.rst @@ -1,5 +1,3 @@ -:orphan: - ============================= Virtual TPM interface for Xen ============================= diff --git a/Documentation/siphash.txt b/Documentation/siphash.txt deleted file mode 100644 index 9965821ab333..000000000000 --- a/Documentation/siphash.txt +++ /dev/null @@ -1,189 +0,0 @@ -=========================== -SipHash - a short input PRF -=========================== - -:Author: Written by Jason A. Donenfeld - -SipHash is a cryptographically secure PRF -- a keyed hash function -- that -performs very well for short inputs, hence the name. It was designed by -cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended -as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`, -and so forth. - -SipHash takes a secret key filled with randomly generated numbers and either -an input buffer or several input integers. It spits out an integer that is -indistinguishable from random. You may then use that integer as part of secure -sequence numbers, secure cookies, or mask it off for use in a hash table. - -Generating a key -================ - -Keys should always be generated from a cryptographically secure source of -random numbers, either using get_random_bytes or get_random_once:: - - siphash_key_t key; - get_random_bytes(&key, sizeof(key)); - -If you're not deriving your key from here, you're doing it wrong. - -Using the functions -=================== - -There are two variants of the function, one that takes a list of integers, and -one that takes a buffer:: - - u64 siphash(const void *data, size_t len, const siphash_key_t *key); - -And:: - - u64 siphash_1u64(u64, const siphash_key_t *key); - u64 siphash_2u64(u64, u64, const siphash_key_t *key); - u64 siphash_3u64(u64, u64, u64, const siphash_key_t *key); - u64 siphash_4u64(u64, u64, u64, u64, const siphash_key_t *key); - u64 siphash_1u32(u32, const siphash_key_t *key); - u64 siphash_2u32(u32, u32, const siphash_key_t *key); - u64 siphash_3u32(u32, u32, u32, const siphash_key_t *key); - u64 siphash_4u32(u32, u32, u32, u32, const siphash_key_t *key); - -If you pass the generic siphash function something of a constant length, it -will constant fold at compile-time and automatically choose one of the -optimized functions. - -Hashtable key function usage:: - - struct some_hashtable { - DECLARE_HASHTABLE(hashtable, 8); - siphash_key_t key; - }; - - void init_hashtable(struct some_hashtable *table) - { - get_random_bytes(&table->key, sizeof(table->key)); - } - - static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) - { - return &table->hashtable[siphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; - } - -You may then iterate like usual over the returned hash bucket. - -Security -======== - -SipHash has a very high security margin, with its 128-bit key. So long as the -key is kept secret, it is impossible for an attacker to guess the outputs of -the function, even if being able to observe many outputs, since 2^128 outputs -is significant. - -Linux implements the "2-4" variant of SipHash. - -Struct-passing Pitfalls -======================= - -Often times the XuY functions will not be large enough, and instead you'll -want to pass a pre-filled struct to siphash. When doing this, it's important -to always ensure the struct has no padding holes. The easiest way to do this -is to simply arrange the members of the struct in descending order of size, -and to use offsetendof() instead of sizeof() for getting the size. For -performance reasons, if possible, it's probably a good thing to align the -struct to the right boundary. Here's an example:: - - const struct { - struct in6_addr saddr; - u32 counter; - u16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .counter = counter, - .dport = dport - }; - u64 h = siphash(&combined, offsetofend(typeof(combined), dport), &secret); - -Resources -========= - -Read the SipHash paper if you're interested in learning more: -https://131002.net/siphash/siphash.pdf - -------------------------------------------------------------------------------- - -=============================================== -HalfSipHash - SipHash's insecure younger cousin -=============================================== - -:Author: Written by Jason A. Donenfeld - -On the off-chance that SipHash is not fast enough for your needs, you might be -able to justify using HalfSipHash, a terrifying but potentially useful -possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and, -even scarier, uses an easily brute-forcable 64-bit key (with a 32-bit output) -instead of SipHash's 128-bit key. However, this may appeal to some -high-performance `jhash` users. - -Danger! - -Do not ever use HalfSipHash except for as a hashtable key function, and only -then when you can be absolutely certain that the outputs will never be -transmitted out of the kernel. This is only remotely useful over `jhash` as a -means of mitigating hashtable flooding denial of service attacks. - -Generating a key -================ - -Keys should always be generated from a cryptographically secure source of -random numbers, either using get_random_bytes or get_random_once: - -hsiphash_key_t key; -get_random_bytes(&key, sizeof(key)); - -If you're not deriving your key from here, you're doing it wrong. - -Using the functions -=================== - -There are two variants of the function, one that takes a list of integers, and -one that takes a buffer:: - - u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key); - -And:: - - u32 hsiphash_1u32(u32, const hsiphash_key_t *key); - u32 hsiphash_2u32(u32, u32, const hsiphash_key_t *key); - u32 hsiphash_3u32(u32, u32, u32, const hsiphash_key_t *key); - u32 hsiphash_4u32(u32, u32, u32, u32, const hsiphash_key_t *key); - -If you pass the generic hsiphash function something of a constant length, it -will constant fold at compile-time and automatically choose one of the -optimized functions. - -Hashtable key function usage -============================ - -:: - - struct some_hashtable { - DECLARE_HASHTABLE(hashtable, 8); - hsiphash_key_t key; - }; - - void init_hashtable(struct some_hashtable *table) - { - get_random_bytes(&table->key, sizeof(table->key)); - } - - static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input) - { - return &table->hashtable[hsiphash(input, sizeof(*input), &table->key) & (HASH_SIZE(table->hashtable) - 1)]; - } - -You may then iterate like usual over the returned hash bucket. - -Performance -=========== - -HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements, -this will not be a problem, as the hashtable lookup isn't the bottleneck. And -in general, this is probably a good sacrifice to make for the security and DoS -resistance of HalfSipHash. -- cgit From e8d776f20f92b9c679bcdcbdf3aee5026d5265f5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 20 Apr 2019 09:20:52 -0300 Subject: docs: x86: move two x86-specific files to x86 arch dir Those two docs belong to the x86 architecture: Documentation/Intel-IOMMU.txt -> Documentation/x86/intel-iommu.rst Documentation/intel_txt.txt -> Documentation/x86/intel_txt.rst Signed-off-by: Mauro Carvalho Chehab --- Documentation/Intel-IOMMU.txt | 114 ------------------- Documentation/intel_txt.txt | 227 -------------------------------------- Documentation/x86/index.rst | 2 + Documentation/x86/intel-iommu.rst | 114 +++++++++++++++++++ Documentation/x86/intel_txt.rst | 227 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 343 insertions(+), 341 deletions(-) delete mode 100644 Documentation/Intel-IOMMU.txt delete mode 100644 Documentation/intel_txt.txt create mode 100644 Documentation/x86/intel-iommu.rst create mode 100644 Documentation/x86/intel_txt.rst (limited to 'Documentation') diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt deleted file mode 100644 index 9dae6b47e398..000000000000 --- a/Documentation/Intel-IOMMU.txt +++ /dev/null @@ -1,114 +0,0 @@ -=================== -Linux IOMMU Support -=================== - -The architecture spec can be obtained from the below location. - -http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf - -This guide gives a quick cheat sheet for some basic understanding. - -Some Keywords - -- DMAR - DMA remapping -- DRHD - DMA Remapping Hardware Unit Definition -- RMRR - Reserved memory Region Reporting Structure -- ZLR - Zero length reads from PCI devices -- IOVA - IO Virtual address. - -Basic stuff ------------ - -ACPI enumerates and lists the different DMA engines in the platform, and -device scope relationships between PCI devices and which DMA engine controls -them. - -What is RMRR? -------------- - -There are some devices the BIOS controls, for e.g USB devices to perform -PS2 emulation. The regions of memory used for these devices are marked -reserved in the e820 map. When we turn on DMA translation, DMA to those -regions will fail. Hence BIOS uses RMRR to specify these regions along with -devices that need to access these regions. OS is expected to setup -unity mappings for these regions for these devices to access these regions. - -How is IOVA generated? ----------------------- - -Well behaved drivers call pci_map_*() calls before sending command to device -that needs to perform DMA. Once DMA is completed and mapping is no longer -required, device performs a pci_unmap_*() calls to unmap the region. - -The Intel IOMMU driver allocates a virtual address per domain. Each PCIE -device has its own domain (hence protection). Devices under p2p bridges -share the virtual address with all devices under the p2p bridge due to -transaction id aliasing for p2p bridges. - -IOVA generation is pretty generic. We used the same technique as vmalloc() -but these are not global address spaces, but separate for each domain. -Different DMA engines may support different number of domains. - -We also allocate guard pages with each mapping, so we can attempt to catch -any overflow that might happen. - - -Graphics Problems? ------------------- -If you encounter issues with graphics devices, you can try adding -option intel_iommu=igfx_off to turn off the integrated graphics engine. -If this fixes anything, please ensure you file a bug reporting the problem. - -Some exceptions to IOVA ------------------------ -Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff). -The same is true for peer to peer transactions. Hence we reserve the -address from PCI MMIO ranges so they are not allocated for IOVA addresses. - - -Fault reporting ---------------- -When errors are reported, the DMA engine signals via an interrupt. The fault -reason and device that caused it with fault reason is printed on console. - -See below for sample. - - -Boot Message Sample -------------------- - -Something like this gets printed indicating presence of DMAR tables -in ACPI. - -ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0 - -When DMAR is being processed and initialized by ACPI, prints DMAR locations -and any RMRR's processed:: - - ACPI DMAR:Host address width 36 - ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000 - ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000 - ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000 - ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff - ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff - -When DMAR is enabled for use, you will notice.. - -PCI-DMA: Using DMAR IOMMU - -Fault reporting ---------------- - -:: - - DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 - DMAR:[fault reason 05] PTE Write access is not set - DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 - DMAR:[fault reason 05] PTE Write access is not set - -TBD ----- - -- For compatibility testing, could use unity map domain for all devices, just - provide a 1-1 for all useful memory under a single domain for all devices. -- API for paravirt ops for abstracting functionality for VMM folks. diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt deleted file mode 100644 index d83c1a2122c9..000000000000 --- a/Documentation/intel_txt.txt +++ /dev/null @@ -1,227 +0,0 @@ -===================== -Intel(R) TXT Overview -===================== - -Intel's technology for safer computing, Intel(R) Trusted Execution -Technology (Intel(R) TXT), defines platform-level enhancements that -provide the building blocks for creating trusted platforms. - -Intel TXT was formerly known by the code name LaGrande Technology (LT). - -Intel TXT in Brief: - -- Provides dynamic root of trust for measurement (DRTM) -- Data protection in case of improper shutdown -- Measurement and verification of launched environment - -Intel TXT is part of the vPro(TM) brand and is also available some -non-vPro systems. It is currently available on desktop systems -based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell -Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, -PM45, and GS45 Express chipsets. - -For more information, see http://www.intel.com/technology/security/. -This site also has a link to the Intel TXT MLE Developers Manual, -which has been updated for the new released platforms. - -Intel TXT has been presented at various events over the past few -years, some of which are: - - - LinuxTAG 2008: - http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag.html - - - TRUST2008: - http://www.trust-conference.eu/downloads/Keynote-Speakers/ - 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf - - - IDF, Shanghai: - http://www.prcidf.com.cn/index_en.html - - - IDFs 2006, 2007 - (I'm not sure if/where they are online) - -Trusted Boot Project Overview -============================= - -Trusted Boot (tboot) is an open source, pre-kernel/VMM module that -uses Intel TXT to perform a measured and verified launch of an OS -kernel/VMM. - -It is hosted on SourceForge at http://sourceforge.net/projects/tboot. -The mercurial source repo is available at http://www.bughost.org/ -repos.hg/tboot.hg. - -Tboot currently supports launching Xen (open source VMM/hypervisor -w/ TXT support since v3.2), and now Linux kernels. - - -Value Proposition for Linux or "Why should you care?" -===================================================== - -While there are many products and technologies that attempt to -measure or protect the integrity of a running kernel, they all -assume the kernel is "good" to begin with. The Integrity -Measurement Architecture (IMA) and Linux Integrity Module interface -are examples of such solutions. - -To get trust in the initial kernel without using Intel TXT, a -static root of trust must be used. This bases trust in BIOS -starting at system reset and requires measurement of all code -executed between system reset through the completion of the kernel -boot as well as data objects used by that code. In the case of a -Linux kernel, this means all of BIOS, any option ROMs, the -bootloader and the boot config. In practice, this is a lot of -code/data, much of which is subject to change from boot to boot -(e.g. changing NICs may change option ROMs). Without reference -hashes, these measurement changes are difficult to assess or -confirm as benign. This process also does not provide DMA -protection, memory configuration/alias checks and locks, crash -protection, or policy support. - -By using the hardware-based root of trust that Intel TXT provides, -many of these issues can be mitigated. Specifically: many -pre-launch components can be removed from the trust chain, DMA -protection is provided to all launched components, a large number -of platform configuration checks are performed and values locked, -protection is provided for any data in the event of an improper -shutdown, and there is support for policy-based execution/verification. -This provides a more stable measurement and a higher assurance of -system configuration and initial state than would be otherwise -possible. Since the tboot project is open source, source code for -almost all parts of the trust chain is available (excepting SMM and -Intel-provided firmware). - -How Does it Work? -================= - -- Tboot is an executable that is launched by the bootloader as - the "kernel" (the binary the bootloader executes). -- It performs all of the work necessary to determine if the - platform supports Intel TXT and, if so, executes the GETSEC[SENTER] - processor instruction that initiates the dynamic root of trust. - - - If tboot determines that the system does not support Intel TXT - or is not configured correctly (e.g. the SINIT AC Module was - incorrect), it will directly launch the kernel with no changes - to any state. - - Tboot will output various information about its progress to the - terminal, serial port, and/or an in-memory log; the output - locations can be configured with a command line switch. - -- The GETSEC[SENTER] instruction will return control to tboot and - tboot then verifies certain aspects of the environment (e.g. TPM NV - lock, e820 table does not have invalid entries, etc.). -- It will wake the APs from the special sleep state the GETSEC[SENTER] - instruction had put them in and place them into a wait-for-SIPI - state. - - - Because the processors will not respond to an INIT or SIPI when - in the TXT environment, it is necessary to create a small VT-x - guest for the APs. When they run in this guest, they will - simply wait for the INIT-SIPI-SIPI sequence, which will cause - VMEXITs, and then disable VT and jump to the SIPI vector. This - approach seemed like a better choice than having to insert - special code into the kernel's MP wakeup sequence. - -- Tboot then applies an (optional) user-defined launch policy to - verify the kernel and initrd. - - - This policy is rooted in TPM NV and is described in the tboot - project. The tboot project also contains code for tools to - create and provision the policy. - - Policies are completely under user control and if not present - then any kernel will be launched. - - Policy action is flexible and can include halting on failures - or simply logging them and continuing. - -- Tboot adjusts the e820 table provided by the bootloader to reserve - its own location in memory as well as to reserve certain other - TXT-related regions. -- As part of its launch, tboot DMA protects all of RAM (using the - VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' - in order to remove this blanket protection and use VT-d's - page-level protection. -- Tboot will populate a shared page with some data about itself and - pass this to the Linux kernel as it transfers control. - - - The location of the shared page is passed via the boot_params - struct as a physical address. - -- The kernel will look for the tboot shared page address and, if it - exists, map it. -- As one of the checks/protections provided by TXT, it makes a copy - of the VT-d DMARs in a DMA-protected region of memory and verifies - them for correctness. The VT-d code will detect if the kernel was - launched with tboot and use this copy instead of the one in the - ACPI table. -- At this point, tboot and TXT are out of the picture until a - shutdown (S) -- In order to put a system into any of the sleep states after a TXT - launch, TXT must first be exited. This is to prevent attacks that - attempt to crash the system to gain control on reboot and steal - data left in memory. - - - The kernel will perform all of its sleep preparation and - populate the shared page with the ACPI data needed to put the - platform in the desired sleep state. - - Then the kernel jumps into tboot via the vector specified in the - shared page. - - Tboot will clean up the environment and disable TXT, then use the - kernel-provided ACPI information to actually place the platform - into the desired sleep state. - - In the case of S3, tboot will also register itself as the resume - vector. This is necessary because it must re-establish the - measured environment upon resume. Once the TXT environment - has been restored, it will restore the TPM PCRs and then - transfer control back to the kernel's S3 resume vector. - In order to preserve system integrity across S3, the kernel - provides tboot with a set of memory ranges (RAM and RESERVED_KERN - in the e820 table, but not any memory that BIOS might alter over - the S3 transition) that tboot will calculate a MAC (message - authentication code) over and then seal with the TPM. On resume - and once the measured environment has been re-established, tboot - will re-calculate the MAC and verify it against the sealed value. - Tboot's policy determines what happens if the verification fails. - Note that the c/s 194 of tboot which has the new MAC code supports - this. - -That's pretty much it for TXT support. - - -Configuring the System -====================== - -This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. - -In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes -allow these to be individually enabled/disabled and the screens in -which to find them are BIOS-specific. - -grub.conf needs to be modified as follows:: - - title Linux 2.6.29-tip w/ tboot - root (hd0,0) - kernel /tboot.gz logging=serial,vga,memory - module /vmlinuz-2.6.29-tip intel_iommu=on ro - root=LABEL=/ rhgb console=ttyS0,115200 3 - module /initrd-2.6.29-tip.img - module /Q35_SINIT_17.BIN - -The kernel option for enabling Intel TXT support is found under the -Security top-level menu and is called "Enable Intel(R) Trusted -Execution Technology (TXT)". It is considered EXPERIMENTAL and -depends on the generic x86 support (to allow maximum flexibility in -kernel build options), since the tboot code will detect whether the -platform actually supports Intel TXT and thus whether any of the -kernel code is executed. - -The Q35_SINIT_17.BIN file is what Intel TXT refers to as an -Authenticated Code Module. It is specific to the chipset in the -system and can also be found on the Trusted Boot site. It is an -(unencrypted) module signed by Intel that is used as part of the -DRTM process to verify and configure the system. It is signed -because it operates at a higher privilege level in the system than -any other macrocode and its correct operation is critical to the -establishment of the DRTM. The process for determining the correct -SINIT ACM for a system is documented in the SINIT-guide.txt file -that is on the tboot SourceForge site under the SINIT ACM downloads. diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index f2de1b2d3ac7..af64c4bb4447 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -20,6 +20,8 @@ x86-specific Documentation mtrr pat intel_mpx + intel-iommu + intel_txt amd-memory-encryption pti mds diff --git a/Documentation/x86/intel-iommu.rst b/Documentation/x86/intel-iommu.rst new file mode 100644 index 000000000000..9dae6b47e398 --- /dev/null +++ b/Documentation/x86/intel-iommu.rst @@ -0,0 +1,114 @@ +=================== +Linux IOMMU Support +=================== + +The architecture spec can be obtained from the below location. + +http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf + +This guide gives a quick cheat sheet for some basic understanding. + +Some Keywords + +- DMAR - DMA remapping +- DRHD - DMA Remapping Hardware Unit Definition +- RMRR - Reserved memory Region Reporting Structure +- ZLR - Zero length reads from PCI devices +- IOVA - IO Virtual address. + +Basic stuff +----------- + +ACPI enumerates and lists the different DMA engines in the platform, and +device scope relationships between PCI devices and which DMA engine controls +them. + +What is RMRR? +------------- + +There are some devices the BIOS controls, for e.g USB devices to perform +PS2 emulation. The regions of memory used for these devices are marked +reserved in the e820 map. When we turn on DMA translation, DMA to those +regions will fail. Hence BIOS uses RMRR to specify these regions along with +devices that need to access these regions. OS is expected to setup +unity mappings for these regions for these devices to access these regions. + +How is IOVA generated? +---------------------- + +Well behaved drivers call pci_map_*() calls before sending command to device +that needs to perform DMA. Once DMA is completed and mapping is no longer +required, device performs a pci_unmap_*() calls to unmap the region. + +The Intel IOMMU driver allocates a virtual address per domain. Each PCIE +device has its own domain (hence protection). Devices under p2p bridges +share the virtual address with all devices under the p2p bridge due to +transaction id aliasing for p2p bridges. + +IOVA generation is pretty generic. We used the same technique as vmalloc() +but these are not global address spaces, but separate for each domain. +Different DMA engines may support different number of domains. + +We also allocate guard pages with each mapping, so we can attempt to catch +any overflow that might happen. + + +Graphics Problems? +------------------ +If you encounter issues with graphics devices, you can try adding +option intel_iommu=igfx_off to turn off the integrated graphics engine. +If this fixes anything, please ensure you file a bug reporting the problem. + +Some exceptions to IOVA +----------------------- +Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff). +The same is true for peer to peer transactions. Hence we reserve the +address from PCI MMIO ranges so they are not allocated for IOVA addresses. + + +Fault reporting +--------------- +When errors are reported, the DMA engine signals via an interrupt. The fault +reason and device that caused it with fault reason is printed on console. + +See below for sample. + + +Boot Message Sample +------------------- + +Something like this gets printed indicating presence of DMAR tables +in ACPI. + +ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0 + +When DMAR is being processed and initialized by ACPI, prints DMAR locations +and any RMRR's processed:: + + ACPI DMAR:Host address width 36 + ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000 + ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000 + ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000 + ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff + ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff + +When DMAR is enabled for use, you will notice.. + +PCI-DMA: Using DMAR IOMMU + +Fault reporting +--------------- + +:: + + DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 + DMAR:[fault reason 05] PTE Write access is not set + DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 + DMAR:[fault reason 05] PTE Write access is not set + +TBD +---- + +- For compatibility testing, could use unity map domain for all devices, just + provide a 1-1 for all useful memory under a single domain for all devices. +- API for paravirt ops for abstracting functionality for VMM folks. diff --git a/Documentation/x86/intel_txt.rst b/Documentation/x86/intel_txt.rst new file mode 100644 index 000000000000..d83c1a2122c9 --- /dev/null +++ b/Documentation/x86/intel_txt.rst @@ -0,0 +1,227 @@ +===================== +Intel(R) TXT Overview +===================== + +Intel's technology for safer computing, Intel(R) Trusted Execution +Technology (Intel(R) TXT), defines platform-level enhancements that +provide the building blocks for creating trusted platforms. + +Intel TXT was formerly known by the code name LaGrande Technology (LT). + +Intel TXT in Brief: + +- Provides dynamic root of trust for measurement (DRTM) +- Data protection in case of improper shutdown +- Measurement and verification of launched environment + +Intel TXT is part of the vPro(TM) brand and is also available some +non-vPro systems. It is currently available on desktop systems +based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell +Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, +PM45, and GS45 Express chipsets. + +For more information, see http://www.intel.com/technology/security/. +This site also has a link to the Intel TXT MLE Developers Manual, +which has been updated for the new released platforms. + +Intel TXT has been presented at various events over the past few +years, some of which are: + + - LinuxTAG 2008: + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag.html + + - TRUST2008: + http://www.trust-conference.eu/downloads/Keynote-Speakers/ + 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf + + - IDF, Shanghai: + http://www.prcidf.com.cn/index_en.html + + - IDFs 2006, 2007 + (I'm not sure if/where they are online) + +Trusted Boot Project Overview +============================= + +Trusted Boot (tboot) is an open source, pre-kernel/VMM module that +uses Intel TXT to perform a measured and verified launch of an OS +kernel/VMM. + +It is hosted on SourceForge at http://sourceforge.net/projects/tboot. +The mercurial source repo is available at http://www.bughost.org/ +repos.hg/tboot.hg. + +Tboot currently supports launching Xen (open source VMM/hypervisor +w/ TXT support since v3.2), and now Linux kernels. + + +Value Proposition for Linux or "Why should you care?" +===================================================== + +While there are many products and technologies that attempt to +measure or protect the integrity of a running kernel, they all +assume the kernel is "good" to begin with. The Integrity +Measurement Architecture (IMA) and Linux Integrity Module interface +are examples of such solutions. + +To get trust in the initial kernel without using Intel TXT, a +static root of trust must be used. This bases trust in BIOS +starting at system reset and requires measurement of all code +executed between system reset through the completion of the kernel +boot as well as data objects used by that code. In the case of a +Linux kernel, this means all of BIOS, any option ROMs, the +bootloader and the boot config. In practice, this is a lot of +code/data, much of which is subject to change from boot to boot +(e.g. changing NICs may change option ROMs). Without reference +hashes, these measurement changes are difficult to assess or +confirm as benign. This process also does not provide DMA +protection, memory configuration/alias checks and locks, crash +protection, or policy support. + +By using the hardware-based root of trust that Intel TXT provides, +many of these issues can be mitigated. Specifically: many +pre-launch components can be removed from the trust chain, DMA +protection is provided to all launched components, a large number +of platform configuration checks are performed and values locked, +protection is provided for any data in the event of an improper +shutdown, and there is support for policy-based execution/verification. +This provides a more stable measurement and a higher assurance of +system configuration and initial state than would be otherwise +possible. Since the tboot project is open source, source code for +almost all parts of the trust chain is available (excepting SMM and +Intel-provided firmware). + +How Does it Work? +================= + +- Tboot is an executable that is launched by the bootloader as + the "kernel" (the binary the bootloader executes). +- It performs all of the work necessary to determine if the + platform supports Intel TXT and, if so, executes the GETSEC[SENTER] + processor instruction that initiates the dynamic root of trust. + + - If tboot determines that the system does not support Intel TXT + or is not configured correctly (e.g. the SINIT AC Module was + incorrect), it will directly launch the kernel with no changes + to any state. + - Tboot will output various information about its progress to the + terminal, serial port, and/or an in-memory log; the output + locations can be configured with a command line switch. + +- The GETSEC[SENTER] instruction will return control to tboot and + tboot then verifies certain aspects of the environment (e.g. TPM NV + lock, e820 table does not have invalid entries, etc.). +- It will wake the APs from the special sleep state the GETSEC[SENTER] + instruction had put them in and place them into a wait-for-SIPI + state. + + - Because the processors will not respond to an INIT or SIPI when + in the TXT environment, it is necessary to create a small VT-x + guest for the APs. When they run in this guest, they will + simply wait for the INIT-SIPI-SIPI sequence, which will cause + VMEXITs, and then disable VT and jump to the SIPI vector. This + approach seemed like a better choice than having to insert + special code into the kernel's MP wakeup sequence. + +- Tboot then applies an (optional) user-defined launch policy to + verify the kernel and initrd. + + - This policy is rooted in TPM NV and is described in the tboot + project. The tboot project also contains code for tools to + create and provision the policy. + - Policies are completely under user control and if not present + then any kernel will be launched. + - Policy action is flexible and can include halting on failures + or simply logging them and continuing. + +- Tboot adjusts the e820 table provided by the bootloader to reserve + its own location in memory as well as to reserve certain other + TXT-related regions. +- As part of its launch, tboot DMA protects all of RAM (using the + VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' + in order to remove this blanket protection and use VT-d's + page-level protection. +- Tboot will populate a shared page with some data about itself and + pass this to the Linux kernel as it transfers control. + + - The location of the shared page is passed via the boot_params + struct as a physical address. + +- The kernel will look for the tboot shared page address and, if it + exists, map it. +- As one of the checks/protections provided by TXT, it makes a copy + of the VT-d DMARs in a DMA-protected region of memory and verifies + them for correctness. The VT-d code will detect if the kernel was + launched with tboot and use this copy instead of the one in the + ACPI table. +- At this point, tboot and TXT are out of the picture until a + shutdown (S) +- In order to put a system into any of the sleep states after a TXT + launch, TXT must first be exited. This is to prevent attacks that + attempt to crash the system to gain control on reboot and steal + data left in memory. + + - The kernel will perform all of its sleep preparation and + populate the shared page with the ACPI data needed to put the + platform in the desired sleep state. + - Then the kernel jumps into tboot via the vector specified in the + shared page. + - Tboot will clean up the environment and disable TXT, then use the + kernel-provided ACPI information to actually place the platform + into the desired sleep state. + - In the case of S3, tboot will also register itself as the resume + vector. This is necessary because it must re-establish the + measured environment upon resume. Once the TXT environment + has been restored, it will restore the TPM PCRs and then + transfer control back to the kernel's S3 resume vector. + In order to preserve system integrity across S3, the kernel + provides tboot with a set of memory ranges (RAM and RESERVED_KERN + in the e820 table, but not any memory that BIOS might alter over + the S3 transition) that tboot will calculate a MAC (message + authentication code) over and then seal with the TPM. On resume + and once the measured environment has been re-established, tboot + will re-calculate the MAC and verify it against the sealed value. + Tboot's policy determines what happens if the verification fails. + Note that the c/s 194 of tboot which has the new MAC code supports + this. + +That's pretty much it for TXT support. + + +Configuring the System +====================== + +This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. + +In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes +allow these to be individually enabled/disabled and the screens in +which to find them are BIOS-specific. + +grub.conf needs to be modified as follows:: + + title Linux 2.6.29-tip w/ tboot + root (hd0,0) + kernel /tboot.gz logging=serial,vga,memory + module /vmlinuz-2.6.29-tip intel_iommu=on ro + root=LABEL=/ rhgb console=ttyS0,115200 3 + module /initrd-2.6.29-tip.img + module /Q35_SINIT_17.BIN + +The kernel option for enabling Intel TXT support is found under the +Security top-level menu and is called "Enable Intel(R) Trusted +Execution Technology (TXT)". It is considered EXPERIMENTAL and +depends on the generic x86 support (to allow maximum flexibility in +kernel build options), since the tboot code will detect whether the +platform actually supports Intel TXT and thus whether any of the +kernel code is executed. + +The Q35_SINIT_17.BIN file is what Intel TXT refers to as an +Authenticated Code Module. It is specific to the chipset in the +system and can also be found on the Trusted Boot site. It is an +(unencrypted) module signed by Intel that is used as part of the +DRTM process to verify and configure the system. It is signed +because it operates at a higher privilege level in the system than +any other macrocode and its correct operation is critical to the +establishment of the DRTM. The process for determining the correct +SINIT ACM for a system is documented in the SINIT-guide.txt file +that is on the tboot SourceForge site under the SINIT ACM downloads. -- cgit From 2dbc0838bcf24ca59cabc3130cf3b1d6809cdcd4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 11:39:21 -0300 Subject: docs: ocxl.rst: add it to the uAPI book The content of this file is user-faced. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andrew Donnellan --- Documentation/accelerators/ocxl.rst | 178 ---------------------- Documentation/userspace-api/accelerators/ocxl.rst | 176 +++++++++++++++++++++ Documentation/userspace-api/index.rst | 1 + 3 files changed, 177 insertions(+), 178 deletions(-) delete mode 100644 Documentation/accelerators/ocxl.rst create mode 100644 Documentation/userspace-api/accelerators/ocxl.rst (limited to 'Documentation') diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst deleted file mode 100644 index b1cea19a90f5..000000000000 --- a/Documentation/accelerators/ocxl.rst +++ /dev/null @@ -1,178 +0,0 @@ -:orphan: - -======================================================== -OpenCAPI (Open Coherent Accelerator Processor Interface) -======================================================== - -OpenCAPI is an interface between processors and accelerators. It aims -at being low-latency and high-bandwidth. The specification is -developed by the `OpenCAPI Consortium `_. - -It allows an accelerator (which could be a FPGA, ASICs, ...) to access -the host memory coherently, using virtual addresses. An OpenCAPI -device can also host its own memory, that can be accessed from the -host. - -OpenCAPI is known in linux as 'ocxl', as the open, processor-agnostic -evolution of 'cxl' (the driver for the IBM CAPI interface for -powerpc), which was named that way to avoid confusion with the ISDN -CAPI subsystem. - - -High-level view -=============== - -OpenCAPI defines a Data Link Layer (DL) and Transaction Layer (TL), to -be implemented on top of a physical link. Any processor or device -implementing the DL and TL can start sharing memory. - -:: - - +-----------+ +-------------+ - | | | | - | | | Accelerated | - | Processor | | Function | - | | +--------+ | Unit | +--------+ - | |--| Memory | | (AFU) |--| Memory | - | | +--------+ | | +--------+ - +-----------+ +-------------+ - | | - +-----------+ +-------------+ - | TL | | TLX | - +-----------+ +-------------+ - | | - +-----------+ +-------------+ - | DL | | DLX | - +-----------+ +-------------+ - | | - | PHY | - +---------------------------------------+ - - - -Device discovery -================ - -OpenCAPI relies on a PCI-like configuration space, implemented on the -device. So the host can discover AFUs by querying the config space. - -OpenCAPI devices in Linux are treated like PCI devices (with a few -caveats). The firmware is expected to abstract the hardware as if it -was a PCI link. A lot of the existing PCI infrastructure is reused: -devices are scanned and BARs are assigned during the standard PCI -enumeration. Commands like 'lspci' can therefore be used to see what -devices are available. - -The configuration space defines the AFU(s) that can be found on the -physical adapter, such as its name, how many memory contexts it can -work with, the size of its MMIO areas, ... - - - -MMIO -==== - -OpenCAPI defines two MMIO areas for each AFU: - -* the global MMIO area, with registers pertinent to the whole AFU. -* a per-process MMIO area, which has a fixed size for each context. - - - -AFU interrupts -============== - -OpenCAPI includes the possibility for an AFU to send an interrupt to a -host process. It is done through a 'intrp_req' defined in the -Transaction Layer, specifying a 64-bit object handle which defines the -interrupt. - -The driver allows a process to allocate an interrupt and obtain its -64-bit object handle, that can be passed to the AFU. - - - -char devices -============ - -The driver creates one char device per AFU found on the physical -device. A physical device may have multiple functions and each -function can have multiple AFUs. At the time of this writing though, -it has only been tested with devices exporting only one AFU. - -Char devices can be found in /dev/ocxl/ and are named as: -/dev/ocxl/.. - -where is a max 20-character long name, as found in the -config space of the AFU. - is added by the driver and can help distinguish devices -when a system has more than one instance of the same OpenCAPI device. - is also to help distinguish AFUs in the unlikely case where a -device carries multiple copies of the same AFU. - - - -Sysfs class -=========== - -An ocxl class is added for the devices representing the AFUs. See -/sys/class/ocxl. The layout is described in -Documentation/ABI/testing/sysfs-class-ocxl - - - -User API -======== - -open ----- - -Based on the AFU definition found in the config space, an AFU may -support working with more than one memory context, in which case the -associated char device may be opened multiple times by different -processes. - - -ioctl ------ - -OCXL_IOCTL_ATTACH: - - Attach the memory context of the calling process to the AFU so that - the AFU can access its memory. - -OCXL_IOCTL_IRQ_ALLOC: - - Allocate an AFU interrupt and return an identifier. - -OCXL_IOCTL_IRQ_FREE: - - Free a previously allocated AFU interrupt. - -OCXL_IOCTL_IRQ_SET_FD: - - Associate an event fd to an AFU interrupt so that the user process - can be notified when the AFU sends an interrupt. - -OCXL_IOCTL_GET_METADATA: - - Obtains configuration information from the card, such at the size of - MMIO areas, the AFU version, and the PASID for the current context. - -OCXL_IOCTL_ENABLE_P9_WAIT: - - Allows the AFU to wake a userspace thread executing 'wait'. Returns - information to userspace to allow it to configure the AFU. Note that - this is only available on POWER9. - -OCXL_IOCTL_GET_FEATURES: - - Reports on which CPU features that affect OpenCAPI are usable from - userspace. - - -mmap ----- - -A process can mmap the per-process MMIO area for interactions with the -AFU. diff --git a/Documentation/userspace-api/accelerators/ocxl.rst b/Documentation/userspace-api/accelerators/ocxl.rst new file mode 100644 index 000000000000..14cefc020e2d --- /dev/null +++ b/Documentation/userspace-api/accelerators/ocxl.rst @@ -0,0 +1,176 @@ +======================================================== +OpenCAPI (Open Coherent Accelerator Processor Interface) +======================================================== + +OpenCAPI is an interface between processors and accelerators. It aims +at being low-latency and high-bandwidth. The specification is +developed by the `OpenCAPI Consortium `_. + +It allows an accelerator (which could be a FPGA, ASICs, ...) to access +the host memory coherently, using virtual addresses. An OpenCAPI +device can also host its own memory, that can be accessed from the +host. + +OpenCAPI is known in linux as 'ocxl', as the open, processor-agnostic +evolution of 'cxl' (the driver for the IBM CAPI interface for +powerpc), which was named that way to avoid confusion with the ISDN +CAPI subsystem. + + +High-level view +=============== + +OpenCAPI defines a Data Link Layer (DL) and Transaction Layer (TL), to +be implemented on top of a physical link. Any processor or device +implementing the DL and TL can start sharing memory. + +:: + + +-----------+ +-------------+ + | | | | + | | | Accelerated | + | Processor | | Function | + | | +--------+ | Unit | +--------+ + | |--| Memory | | (AFU) |--| Memory | + | | +--------+ | | +--------+ + +-----------+ +-------------+ + | | + +-----------+ +-------------+ + | TL | | TLX | + +-----------+ +-------------+ + | | + +-----------+ +-------------+ + | DL | | DLX | + +-----------+ +-------------+ + | | + | PHY | + +---------------------------------------+ + + + +Device discovery +================ + +OpenCAPI relies on a PCI-like configuration space, implemented on the +device. So the host can discover AFUs by querying the config space. + +OpenCAPI devices in Linux are treated like PCI devices (with a few +caveats). The firmware is expected to abstract the hardware as if it +was a PCI link. A lot of the existing PCI infrastructure is reused: +devices are scanned and BARs are assigned during the standard PCI +enumeration. Commands like 'lspci' can therefore be used to see what +devices are available. + +The configuration space defines the AFU(s) that can be found on the +physical adapter, such as its name, how many memory contexts it can +work with, the size of its MMIO areas, ... + + + +MMIO +==== + +OpenCAPI defines two MMIO areas for each AFU: + +* the global MMIO area, with registers pertinent to the whole AFU. +* a per-process MMIO area, which has a fixed size for each context. + + + +AFU interrupts +============== + +OpenCAPI includes the possibility for an AFU to send an interrupt to a +host process. It is done through a 'intrp_req' defined in the +Transaction Layer, specifying a 64-bit object handle which defines the +interrupt. + +The driver allows a process to allocate an interrupt and obtain its +64-bit object handle, that can be passed to the AFU. + + + +char devices +============ + +The driver creates one char device per AFU found on the physical +device. A physical device may have multiple functions and each +function can have multiple AFUs. At the time of this writing though, +it has only been tested with devices exporting only one AFU. + +Char devices can be found in /dev/ocxl/ and are named as: +/dev/ocxl/.. + +where is a max 20-character long name, as found in the +config space of the AFU. + is added by the driver and can help distinguish devices +when a system has more than one instance of the same OpenCAPI device. + is also to help distinguish AFUs in the unlikely case where a +device carries multiple copies of the same AFU. + + + +Sysfs class +=========== + +An ocxl class is added for the devices representing the AFUs. See +/sys/class/ocxl. The layout is described in +Documentation/ABI/testing/sysfs-class-ocxl + + + +User API +======== + +open +---- + +Based on the AFU definition found in the config space, an AFU may +support working with more than one memory context, in which case the +associated char device may be opened multiple times by different +processes. + + +ioctl +----- + +OCXL_IOCTL_ATTACH: + + Attach the memory context of the calling process to the AFU so that + the AFU can access its memory. + +OCXL_IOCTL_IRQ_ALLOC: + + Allocate an AFU interrupt and return an identifier. + +OCXL_IOCTL_IRQ_FREE: + + Free a previously allocated AFU interrupt. + +OCXL_IOCTL_IRQ_SET_FD: + + Associate an event fd to an AFU interrupt so that the user process + can be notified when the AFU sends an interrupt. + +OCXL_IOCTL_GET_METADATA: + + Obtains configuration information from the card, such at the size of + MMIO areas, the AFU version, and the PASID for the current context. + +OCXL_IOCTL_ENABLE_P9_WAIT: + + Allows the AFU to wake a userspace thread executing 'wait'. Returns + information to userspace to allow it to configure the AFU. Note that + this is only available on POWER9. + +OCXL_IOCTL_GET_FEATURES: + + Reports on which CPU features that affect OpenCAPI are usable from + userspace. + + +mmap +---- + +A process can mmap the per-process MMIO area for interactions with the +AFU. diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index a3233da7fa88..ad494da40009 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -20,6 +20,7 @@ place where this information is gathered. seccomp_filter unshare spec_ctrl + accelerators/ocxl .. only:: subproject and html -- cgit From 56198359b64125dd0f9fa991972b61e4bc4fc6b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 11:44:24 -0300 Subject: docs: lp855x-driver.rst: add it to the driver-api book The content of this file is intended for backlight Kernel developers. Signed-off-by: Mauro Carvalho Chehab --- Documentation/backlight/lp855x-driver.rst | 83 ---------------------- .../driver-api/backlight/lp855x-driver.rst | 81 +++++++++++++++++++++ Documentation/driver-api/index.rst | 1 + 3 files changed, 82 insertions(+), 83 deletions(-) delete mode 100644 Documentation/backlight/lp855x-driver.rst create mode 100644 Documentation/driver-api/backlight/lp855x-driver.rst (limited to 'Documentation') diff --git a/Documentation/backlight/lp855x-driver.rst b/Documentation/backlight/lp855x-driver.rst deleted file mode 100644 index 62b7ed847a77..000000000000 --- a/Documentation/backlight/lp855x-driver.rst +++ /dev/null @@ -1,83 +0,0 @@ -:orphan: - -==================== -Kernel driver lp855x -==================== - -Backlight driver for LP855x ICs - -Supported chips: - - Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and - LP8557 - -Author: Milo(Woogyom) Kim - -Description ------------ - -* Brightness control - - Brightness can be controlled by the pwm input or the i2c command. - The lp855x driver supports both cases. - -* Device attributes - - 1) bl_ctl_mode - - Backlight control mode. - - Value: pwm based or register based - - 2) chip_id - - The lp855x chip id. - - Value: lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557 - -Platform data for lp855x ------------------------- - -For supporting platform specific data, the lp855x platform data can be used. - -* name: - Backlight driver name. If it is not defined, default name is set. -* device_control: - Value of DEVICE CONTROL register. -* initial_brightness: - Initial value of backlight brightness. -* period_ns: - Platform specific PWM period value. unit is nano. - Only valid when brightness is pwm input mode. -* size_program: - Total size of lp855x_rom_data. -* rom_data: - List of new eeprom/eprom registers. - -Examples -======== - -1) lp8552 platform data: i2c register mode with new eeprom data:: - - #define EEPROM_A5_ADDR 0xA5 - #define EEPROM_A5_VAL 0x4f /* EN_VSYNC=0 */ - - static struct lp855x_rom_data lp8552_eeprom_arr[] = { - {EEPROM_A5_ADDR, EEPROM_A5_VAL}, - }; - - static struct lp855x_platform_data lp8552_pdata = { - .name = "lcd-bl", - .device_control = I2C_CONFIG(LP8552), - .initial_brightness = INITIAL_BRT, - .size_program = ARRAY_SIZE(lp8552_eeprom_arr), - .rom_data = lp8552_eeprom_arr, - }; - -2) lp8556 platform data: pwm input mode with default rom data:: - - static struct lp855x_platform_data lp8556_pdata = { - .device_control = PWM_CONFIG(LP8556), - .initial_brightness = INITIAL_BRT, - .period_ns = 1000000, - }; diff --git a/Documentation/driver-api/backlight/lp855x-driver.rst b/Documentation/driver-api/backlight/lp855x-driver.rst new file mode 100644 index 000000000000..1e0b224fc397 --- /dev/null +++ b/Documentation/driver-api/backlight/lp855x-driver.rst @@ -0,0 +1,81 @@ +==================== +Kernel driver lp855x +==================== + +Backlight driver for LP855x ICs + +Supported chips: + + Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and + LP8557 + +Author: Milo(Woogyom) Kim + +Description +----------- + +* Brightness control + + Brightness can be controlled by the pwm input or the i2c command. + The lp855x driver supports both cases. + +* Device attributes + + 1) bl_ctl_mode + + Backlight control mode. + + Value: pwm based or register based + + 2) chip_id + + The lp855x chip id. + + Value: lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557 + +Platform data for lp855x +------------------------ + +For supporting platform specific data, the lp855x platform data can be used. + +* name: + Backlight driver name. If it is not defined, default name is set. +* device_control: + Value of DEVICE CONTROL register. +* initial_brightness: + Initial value of backlight brightness. +* period_ns: + Platform specific PWM period value. unit is nano. + Only valid when brightness is pwm input mode. +* size_program: + Total size of lp855x_rom_data. +* rom_data: + List of new eeprom/eprom registers. + +Examples +======== + +1) lp8552 platform data: i2c register mode with new eeprom data:: + + #define EEPROM_A5_ADDR 0xA5 + #define EEPROM_A5_VAL 0x4f /* EN_VSYNC=0 */ + + static struct lp855x_rom_data lp8552_eeprom_arr[] = { + {EEPROM_A5_ADDR, EEPROM_A5_VAL}, + }; + + static struct lp855x_platform_data lp8552_pdata = { + .name = "lcd-bl", + .device_control = I2C_CONFIG(LP8552), + .initial_brightness = INITIAL_BRT, + .size_program = ARRAY_SIZE(lp8552_eeprom_arr), + .rom_data = lp8552_eeprom_arr, + }; + +2) lp8556 platform data: pwm input mode with default rom data:: + + static struct lp855x_platform_data lp8556_pdata = { + .device_control = PWM_CONFIG(LP8556), + .initial_brightness = INITIAL_BRT, + .period_ns = 1000000, + }; diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 0f281f4f648f..b4c993ff7655 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -66,6 +66,7 @@ available subsections can be seen below. soundwire/index fpga/index acpi/index + backlight/lp855x-driver.rst generic-counter .. only:: subproject and html -- cgit From fe34c89d25429e079ba67416529514120dd715f8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 12:34:59 -0300 Subject: docs: driver-model: move it to the driver-api book The audience for the Kernel driver-model is clearly Kernel hackers. Signed-off-by: Mauro Carvalho Chehab Acked-by: Jeff Kirsher # ice driver changes --- Documentation/driver-api/driver-model/binding.rst | 98 +++++ Documentation/driver-api/driver-model/bus.rst | 146 +++++++ Documentation/driver-api/driver-model/class.rst | 149 +++++++ .../driver-api/driver-model/design-patterns.rst | 116 ++++++ Documentation/driver-api/driver-model/device.rst | 109 +++++ Documentation/driver-api/driver-model/devres.rst | 414 +++++++++++++++++++ Documentation/driver-api/driver-model/driver.rst | 223 ++++++++++ Documentation/driver-api/driver-model/index.rst | 24 ++ Documentation/driver-api/driver-model/overview.rst | 124 ++++++ Documentation/driver-api/driver-model/platform.rst | 246 +++++++++++ Documentation/driver-api/driver-model/porting.rst | 448 +++++++++++++++++++++ Documentation/driver-api/gpio/driver.rst | 2 +- Documentation/driver-api/index.rst | 1 + Documentation/driver-model/binding.rst | 98 ----- Documentation/driver-model/bus.rst | 146 ------- Documentation/driver-model/class.rst | 149 ------- Documentation/driver-model/design-patterns.rst | 116 ------ Documentation/driver-model/device.rst | 109 ----- Documentation/driver-model/devres.rst | 414 ------------------- Documentation/driver-model/driver.rst | 223 ---------- Documentation/driver-model/index.rst | 26 -- Documentation/driver-model/overview.rst | 124 ------ Documentation/driver-model/platform.rst | 246 ----------- Documentation/driver-model/porting.rst | 448 --------------------- Documentation/eisa.txt | 4 +- Documentation/filesystems/sysfs.txt | 2 +- Documentation/hwmon/submitting-patches.rst | 2 +- .../translations/zh_CN/filesystems/sysfs.txt | 2 +- 28 files changed, 2104 insertions(+), 2105 deletions(-) create mode 100644 Documentation/driver-api/driver-model/binding.rst create mode 100644 Documentation/driver-api/driver-model/bus.rst create mode 100644 Documentation/driver-api/driver-model/class.rst create mode 100644 Documentation/driver-api/driver-model/design-patterns.rst create mode 100644 Documentation/driver-api/driver-model/device.rst create mode 100644 Documentation/driver-api/driver-model/devres.rst create mode 100644 Documentation/driver-api/driver-model/driver.rst create mode 100644 Documentation/driver-api/driver-model/index.rst create mode 100644 Documentation/driver-api/driver-model/overview.rst create mode 100644 Documentation/driver-api/driver-model/platform.rst create mode 100644 Documentation/driver-api/driver-model/porting.rst delete mode 100644 Documentation/driver-model/binding.rst delete mode 100644 Documentation/driver-model/bus.rst delete mode 100644 Documentation/driver-model/class.rst delete mode 100644 Documentation/driver-model/design-patterns.rst delete mode 100644 Documentation/driver-model/device.rst delete mode 100644 Documentation/driver-model/devres.rst delete mode 100644 Documentation/driver-model/driver.rst delete mode 100644 Documentation/driver-model/index.rst delete mode 100644 Documentation/driver-model/overview.rst delete mode 100644 Documentation/driver-model/platform.rst delete mode 100644 Documentation/driver-model/porting.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/driver-model/binding.rst b/Documentation/driver-api/driver-model/binding.rst new file mode 100644 index 000000000000..7ea1d7a41e1d --- /dev/null +++ b/Documentation/driver-api/driver-model/binding.rst @@ -0,0 +1,98 @@ +============== +Driver Binding +============== + +Driver binding is the process of associating a device with a device +driver that can control it. Bus drivers have typically handled this +because there have been bus-specific structures to represent the +devices and the drivers. With generic device and device driver +structures, most of the binding can take place using common code. + + +Bus +~~~ + +The bus type structure contains a list of all devices that are on that bus +type in the system. When device_register is called for a device, it is +inserted into the end of this list. The bus object also contains a +list of all drivers of that bus type. When driver_register is called +for a driver, it is inserted at the end of this list. These are the +two events which trigger driver binding. + + +device_register +~~~~~~~~~~~~~~~ + +When a new device is added, the bus's list of drivers is iterated over +to find one that supports it. In order to determine that, the device +ID of the device must match one of the device IDs that the driver +supports. The format and semantics for comparing IDs is bus-specific. +Instead of trying to derive a complex state machine and matching +algorithm, it is up to the bus driver to provide a callback to compare +a device against the IDs of a driver. The bus returns 1 if a match was +found; 0 otherwise. + +int match(struct device * dev, struct device_driver * drv); + +If a match is found, the device's driver field is set to the driver +and the driver's probe callback is called. This gives the driver a +chance to verify that it really does support the hardware, and that +it's in a working state. + +Device Class +~~~~~~~~~~~~ + +Upon the successful completion of probe, the device is registered with +the class to which it belongs. Device drivers belong to one and only one +class, and that is set in the driver's devclass field. +devclass_add_device is called to enumerate the device within the class +and actually register it with the class, which happens with the +class's register_dev callback. + + +Driver +~~~~~~ + +When a driver is attached to a device, the device is inserted into the +driver's list of devices. + + +sysfs +~~~~~ + +A symlink is created in the bus's 'devices' directory that points to +the device's directory in the physical hierarchy. + +A symlink is created in the driver's 'devices' directory that points +to the device's directory in the physical hierarchy. + +A directory for the device is created in the class's directory. A +symlink is created in that directory that points to the device's +physical location in the sysfs tree. + +A symlink can be created (though this isn't done yet) in the device's +physical directory to either its class directory, or the class's +top-level directory. One can also be created to point to its driver's +directory also. + + +driver_register +~~~~~~~~~~~~~~~ + +The process is almost identical for when a new driver is added. +The bus's list of devices is iterated over to find a match. Devices +that already have a driver are skipped. All the devices are iterated +over, to bind as many devices as possible to the driver. + + +Removal +~~~~~~~ + +When a device is removed, the reference count for it will eventually +go to 0. When it does, the remove callback of the driver is called. It +is removed from the driver's list of devices and the reference count +of the driver is decremented. All symlinks between the two are removed. + +When a driver is removed, the list of devices that it supports is +iterated over, and the driver's remove callback is called for each +one. The device is removed from that list and the symlinks removed. diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst new file mode 100644 index 000000000000..016b15a6e8ea --- /dev/null +++ b/Documentation/driver-api/driver-model/bus.rst @@ -0,0 +1,146 @@ +========= +Bus Types +========= + +Definition +~~~~~~~~~~ +See the kerneldoc for the struct bus_type. + +int bus_register(struct bus_type * bus); + + +Declaration +~~~~~~~~~~~ + +Each bus type in the kernel (PCI, USB, etc) should declare one static +object of this type. They must initialize the name field, and may +optionally initialize the match callback:: + + struct bus_type pci_bus_type = { + .name = "pci", + .match = pci_bus_match, + }; + +The structure should be exported to drivers in a header file: + +extern struct bus_type pci_bus_type; + + +Registration +~~~~~~~~~~~~ + +When a bus driver is initialized, it calls bus_register. This +initializes the rest of the fields in the bus object and inserts it +into a global list of bus types. Once the bus object is registered, +the fields in it are usable by the bus driver. + + +Callbacks +~~~~~~~~~ + +match(): Attaching Drivers to Devices +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The format of device ID structures and the semantics for comparing +them are inherently bus-specific. Drivers typically declare an array +of device IDs of devices they support that reside in a bus-specific +driver structure. + +The purpose of the match callback is to give the bus an opportunity to +determine if a particular driver supports a particular device by +comparing the device IDs the driver supports with the device ID of a +particular device, without sacrificing bus-specific functionality or +type-safety. + +When a driver is registered with the bus, the bus's list of devices is +iterated over, and the match callback is called for each device that +does not have a driver associated with it. + + + +Device and Driver Lists +~~~~~~~~~~~~~~~~~~~~~~~ + +The lists of devices and drivers are intended to replace the local +lists that many buses keep. They are lists of struct devices and +struct device_drivers, respectively. Bus drivers are free to use the +lists as they please, but conversion to the bus-specific type may be +necessary. + +The LDM core provides helper functions for iterating over each list:: + + int bus_for_each_dev(struct bus_type * bus, struct device * start, + void * data, + int (*fn)(struct device *, void *)); + + int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, + void * data, int (*fn)(struct device_driver *, void *)); + +These helpers iterate over the respective list, and call the callback +for each device or driver in the list. All list accesses are +synchronized by taking the bus's lock (read currently). The reference +count on each object in the list is incremented before the callback is +called; it is decremented after the next object has been obtained. The +lock is not held when calling the callback. + + +sysfs +~~~~~~~~ +There is a top-level directory named 'bus'. + +Each bus gets a directory in the bus directory, along with two default +directories:: + + /sys/bus/pci/ + |-- devices + `-- drivers + +Drivers registered with the bus get a directory in the bus's drivers +directory:: + + /sys/bus/pci/ + |-- devices + `-- drivers + |-- Intel ICH + |-- Intel ICH Joystick + |-- agpgart + `-- e100 + +Each device that is discovered on a bus of that type gets a symlink in +the bus's devices directory to the device's directory in the physical +hierarchy:: + + /sys/bus/pci/ + |-- devices + | |-- 00:00.0 -> ../../../root/pci0/00:00.0 + | |-- 00:01.0 -> ../../../root/pci0/00:01.0 + | `-- 00:02.0 -> ../../../root/pci0/00:02.0 + `-- drivers + + +Exporting Attributes +~~~~~~~~~~~~~~~~~~~~ + +:: + + struct bus_attribute { + struct attribute attr; + ssize_t (*show)(struct bus_type *, char * buf); + ssize_t (*store)(struct bus_type *, const char * buf, size_t count); + }; + +Bus drivers can export attributes using the BUS_ATTR_RW macro that works +similarly to the DEVICE_ATTR_RW macro for devices. For example, a +definition like this:: + + static BUS_ATTR_RW(debug); + +is equivalent to declaring:: + + static bus_attribute bus_attr_debug; + +This can then be used to add and remove the attribute from the bus's +sysfs directory using:: + + int bus_create_file(struct bus_type *, struct bus_attribute *); + void bus_remove_file(struct bus_type *, struct bus_attribute *); diff --git a/Documentation/driver-api/driver-model/class.rst b/Documentation/driver-api/driver-model/class.rst new file mode 100644 index 000000000000..fff55b80e86a --- /dev/null +++ b/Documentation/driver-api/driver-model/class.rst @@ -0,0 +1,149 @@ +============== +Device Classes +============== + +Introduction +~~~~~~~~~~~~ +A device class describes a type of device, like an audio or network +device. The following device classes have been identified: + + + + +Each device class defines a set of semantics and a programming interface +that devices of that class adhere to. Device drivers are the +implementation of that programming interface for a particular device on +a particular bus. + +Device classes are agnostic with respect to what bus a device resides +on. + + +Programming Interface +~~~~~~~~~~~~~~~~~~~~~ +The device class structure looks like:: + + + typedef int (*devclass_add)(struct device *); + typedef void (*devclass_remove)(struct device *); + +See the kerneldoc for the struct class. + +A typical device class definition would look like:: + + struct device_class input_devclass = { + .name = "input", + .add_device = input_add_device, + .remove_device = input_remove_device, + }; + +Each device class structure should be exported in a header file so it +can be used by drivers, extensions and interfaces. + +Device classes are registered and unregistered with the core using:: + + int devclass_register(struct device_class * cls); + void devclass_unregister(struct device_class * cls); + + +Devices +~~~~~~~ +As devices are bound to drivers, they are added to the device class +that the driver belongs to. Before the driver model core, this would +typically happen during the driver's probe() callback, once the device +has been initialized. It now happens after the probe() callback +finishes from the core. + +The device is enumerated in the class. Each time a device is added to +the class, the class's devnum field is incremented and assigned to the +device. The field is never decremented, so if the device is removed +from the class and re-added, it will receive a different enumerated +value. + +The class is allowed to create a class-specific structure for the +device and store it in the device's class_data pointer. + +There is no list of devices in the device class. Each driver has a +list of devices that it supports. The device class has a list of +drivers of that particular class. To access all of the devices in the +class, iterate over the device lists of each driver in the class. + + +Device Drivers +~~~~~~~~~~~~~~ +Device drivers are added to device classes when they are registered +with the core. A driver specifies the class it belongs to by setting +the struct device_driver::devclass field. + + +sysfs directory structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +There is a top-level sysfs directory named 'class'. + +Each class gets a directory in the class directory, along with two +default subdirectories:: + + class/ + `-- input + |-- devices + `-- drivers + + +Drivers registered with the class get a symlink in the drivers/ directory +that points to the driver's directory (under its bus directory):: + + class/ + `-- input + |-- devices + `-- drivers + `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/ + + +Each device gets a symlink in the devices/ directory that points to the +device's directory in the physical hierarchy:: + + class/ + `-- input + |-- devices + | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/ + `-- drivers + + +Exporting Attributes +~~~~~~~~~~~~~~~~~~~~ + +:: + + struct devclass_attribute { + struct attribute attr; + ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off); + ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off); + }; + +Class drivers can export attributes using the DEVCLASS_ATTR macro that works +similarly to the DEVICE_ATTR macro for devices. For example, a definition +like this:: + + static DEVCLASS_ATTR(debug,0644,show_debug,store_debug); + +is equivalent to declaring:: + + static devclass_attribute devclass_attr_debug; + +The bus driver can add and remove the attribute from the class's +sysfs directory using:: + + int devclass_create_file(struct device_class *, struct devclass_attribute *); + void devclass_remove_file(struct device_class *, struct devclass_attribute *); + +In the example above, the file will be named 'debug' in placed in the +class's directory in sysfs. + + +Interfaces +~~~~~~~~~~ +There may exist multiple mechanisms for accessing the same device of a +particular class type. Device interfaces describe these mechanisms. + +When a device is added to a device class, the core attempts to add it +to every interface that is registered with the device class. diff --git a/Documentation/driver-api/driver-model/design-patterns.rst b/Documentation/driver-api/driver-model/design-patterns.rst new file mode 100644 index 000000000000..41eb8f41f7dd --- /dev/null +++ b/Documentation/driver-api/driver-model/design-patterns.rst @@ -0,0 +1,116 @@ +============================= +Device Driver Design Patterns +============================= + +This document describes a few common design patterns found in device drivers. +It is likely that subsystem maintainers will ask driver developers to +conform to these design patterns. + +1. State Container +2. container_of() + + +1. State Container +~~~~~~~~~~~~~~~~~~ + +While the kernel contains a few device drivers that assume that they will +only be probed() once on a certain system (singletons), it is custom to assume +that the device the driver binds to will appear in several instances. This +means that the probe() function and all callbacks need to be reentrant. + +The most common way to achieve this is to use the state container design +pattern. It usually has this form:: + + struct foo { + spinlock_t lock; /* Example member */ + (...) + }; + + static int foo_probe(...) + { + struct foo *foo; + + foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL); + if (!foo) + return -ENOMEM; + spin_lock_init(&foo->lock); + (...) + } + +This will create an instance of struct foo in memory every time probe() is +called. This is our state container for this instance of the device driver. +Of course it is then necessary to always pass this instance of the +state around to all functions that need access to the state and its members. + +For example, if the driver is registering an interrupt handler, you would +pass around a pointer to struct foo like this:: + + static irqreturn_t foo_handler(int irq, void *arg) + { + struct foo *foo = arg; + (...) + } + + static int foo_probe(...) + { + struct foo *foo; + + (...) + ret = request_irq(irq, foo_handler, 0, "foo", foo); + } + +This way you always get a pointer back to the correct instance of foo in +your interrupt handler. + + +2. container_of() +~~~~~~~~~~~~~~~~~ + +Continuing on the above example we add an offloaded work:: + + struct foo { + spinlock_t lock; + struct workqueue_struct *wq; + struct work_struct offload; + (...) + }; + + static void foo_work(struct work_struct *work) + { + struct foo *foo = container_of(work, struct foo, offload); + + (...) + } + + static irqreturn_t foo_handler(int irq, void *arg) + { + struct foo *foo = arg; + + queue_work(foo->wq, &foo->offload); + (...) + } + + static int foo_probe(...) + { + struct foo *foo; + + foo->wq = create_singlethread_workqueue("foo-wq"); + INIT_WORK(&foo->offload, foo_work); + (...) + } + +The design pattern is the same for an hrtimer or something similar that will +return a single argument which is a pointer to a struct member in the +callback. + +container_of() is a macro defined in + +What container_of() does is to obtain a pointer to the containing struct from +a pointer to a member by a simple subtraction using the offsetof() macro from +standard C, which allows something similar to object oriented behaviours. +Notice that the contained member must not be a pointer, but an actual member +for this to work. + +We can see here that we avoid having global pointers to our struct foo * +instance this way, while still keeping the number of parameters passed to the +work function to a single pointer. diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst new file mode 100644 index 000000000000..2b868d49d349 --- /dev/null +++ b/Documentation/driver-api/driver-model/device.rst @@ -0,0 +1,109 @@ +========================== +The Basic Device Structure +========================== + +See the kerneldoc for the struct device. + + +Programming Interface +~~~~~~~~~~~~~~~~~~~~~ +The bus driver that discovers the device uses this to register the +device with the core:: + + int device_register(struct device * dev); + +The bus should initialize the following fields: + + - parent + - name + - bus_id + - bus + +A device is removed from the core when its reference count goes to +0. The reference count can be adjusted using:: + + struct device * get_device(struct device * dev); + void put_device(struct device * dev); + +get_device() will return a pointer to the struct device passed to it +if the reference is not already 0 (if it's in the process of being +removed already). + +A driver can access the lock in the device structure using:: + + void lock_device(struct device * dev); + void unlock_device(struct device * dev); + + +Attributes +~~~~~~~~~~ + +:: + + struct device_attribute { + struct attribute attr; + ssize_t (*show)(struct device *dev, struct device_attribute *attr, + char *buf); + ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + }; + +Attributes of devices can be exported by a device driver through sysfs. + +Please see Documentation/filesystems/sysfs.txt for more information +on how sysfs works. + +As explained in Documentation/kobject.txt, device attributes must be +created before the KOBJ_ADD uevent is generated. The only way to realize +that is by defining an attribute group. + +Attributes are declared using a macro called DEVICE_ATTR:: + + #define DEVICE_ATTR(name,mode,show,store) + +Example::: + + static DEVICE_ATTR(type, 0444, show_type, NULL); + static DEVICE_ATTR(power, 0644, show_power, store_power); + +This declares two structures of type struct device_attribute with respective +names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be +organized as follows into a group:: + + static struct attribute *dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_power.attr, + NULL, + }; + + static struct attribute_group dev_attr_group = { + .attrs = dev_attrs, + }; + + static const struct attribute_group *dev_attr_groups[] = { + &dev_attr_group, + NULL, + }; + +This array of groups can then be associated with a device by setting the +group pointer in struct device before device_register() is invoked:: + + dev->groups = dev_attr_groups; + device_register(dev); + +The device_register() function will use the 'groups' pointer to create the +device attributes and the device_unregister() function will use this pointer +to remove the device attributes. + +Word of warning: While the kernel allows device_create_file() and +device_remove_file() to be called on a device at any time, userspace has +strict expectations on when attributes get created. When a new device is +registered in the kernel, a uevent is generated to notify userspace (like +udev) that a new device is available. If attributes are added after the +device is registered, then userspace won't get notified and userspace will +not know about the new attributes. + +This is important for device driver that need to publish additional +attributes for a device at driver probe time. If the device driver simply +calls device_create_file() on the device structure passed to it, then +userspace will never be notified of the new attributes. diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst new file mode 100644 index 000000000000..4ac99122b5f1 --- /dev/null +++ b/Documentation/driver-api/driver-model/devres.rst @@ -0,0 +1,414 @@ +================================ +Devres - Managed Device Resource +================================ + +Tejun Heo + +First draft 10 January 2007 + +.. contents + + 1. Intro : Huh? Devres? + 2. Devres : Devres in a nutshell + 3. Devres Group : Group devres'es and release them together + 4. Details : Life time rules, calling context, ... + 5. Overhead : How much do we have to pay for this? + 6. List of managed interfaces: Currently implemented managed interfaces + + +1. Intro +-------- + +devres came up while trying to convert libata to use iomap. Each +iomapped address should be kept and unmapped on driver detach. For +example, a plain SFF ATA controller (that is, good old PCI IDE) in +native mode makes use of 5 PCI BARs and all of them should be +maintained. + +As with many other device drivers, libata low level drivers have +sufficient bugs in ->remove and ->probe failure path. Well, yes, +that's probably because libata low level driver developers are lazy +bunch, but aren't all low level driver developers? After spending a +day fiddling with braindamaged hardware with no document or +braindamaged document, if it's finally working, well, it's working. + +For one reason or another, low level drivers don't receive as much +attention or testing as core code, and bugs on driver detach or +initialization failure don't happen often enough to be noticeable. +Init failure path is worse because it's much less travelled while +needs to handle multiple entry points. + +So, many low level drivers end up leaking resources on driver detach +and having half broken failure path implementation in ->probe() which +would leak resources or even cause oops when failure occurs. iomap +adds more to this mix. So do msi and msix. + + +2. Devres +--------- + +devres is basically linked list of arbitrarily sized memory areas +associated with a struct device. Each devres entry is associated with +a release function. A devres can be released in several ways. No +matter what, all devres entries are released on driver detach. On +release, the associated release function is invoked and then the +devres entry is freed. + +Managed interface is created for resources commonly used by device +drivers using devres. For example, coherent DMA memory is acquired +using dma_alloc_coherent(). The managed version is called +dmam_alloc_coherent(). It is identical to dma_alloc_coherent() except +for the DMA memory allocated using it is managed and will be +automatically released on driver detach. Implementation looks like +the following:: + + struct dma_devres { + size_t size; + void *vaddr; + dma_addr_t dma_handle; + }; + + static void dmam_coherent_release(struct device *dev, void *res) + { + struct dma_devres *this = res; + + dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle); + } + + dmam_alloc_coherent(dev, size, dma_handle, gfp) + { + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp); + ... + + /* alloc DMA memory as usual */ + vaddr = dma_alloc_coherent(...); + ... + + /* record size, vaddr, dma_handle in dr */ + dr->vaddr = vaddr; + ... + + devres_add(dev, dr); + + return vaddr; + } + +If a driver uses dmam_alloc_coherent(), the area is guaranteed to be +freed whether initialization fails half-way or the device gets +detached. If most resources are acquired using managed interface, a +driver can have much simpler init and exit code. Init path basically +looks like the following:: + + my_init_one() + { + struct mydev *d; + + d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->ring = dmam_alloc_coherent(...); + if (!d->ring) + return -ENOMEM; + + if (check something) + return -EINVAL; + ... + + return register_to_upper_layer(d); + } + +And exit path:: + + my_remove_one() + { + unregister_from_upper_layer(d); + shutdown_my_hardware(); + } + +As shown above, low level drivers can be simplified a lot by using +devres. Complexity is shifted from less maintained low level drivers +to better maintained higher layer. Also, as init failure path is +shared with exit path, both can get more testing. + +Note though that when converting current calls or assignments to +managed devm_* versions it is up to you to check if internal operations +like allocating memory, have failed. Managed resources pertains to the +freeing of these resources *only* - all other checks needed are still +on you. In some cases this may mean introducing checks that were not +necessary before moving to the managed devm_* calls. + + +3. Devres group +--------------- + +Devres entries can be grouped using devres group. When a group is +released, all contained normal devres entries and properly nested +groups are released. One usage is to rollback series of acquired +resources on failure. For example:: + + if (!devres_open_group(dev, NULL, GFP_KERNEL)) + return -ENOMEM; + + acquire A; + if (failed) + goto err; + + acquire B; + if (failed) + goto err; + ... + + devres_remove_group(dev, NULL); + return 0; + + err: + devres_release_group(dev, NULL); + return err_code; + +As resource acquisition failure usually means probe failure, constructs +like above are usually useful in midlayer driver (e.g. libata core +layer) where interface function shouldn't have side effect on failure. +For LLDs, just returning error code suffices in most cases. + +Each group is identified by `void *id`. It can either be explicitly +specified by @id argument to devres_open_group() or automatically +created by passing NULL as @id as in the above example. In both +cases, devres_open_group() returns the group's id. The returned id +can be passed to other devres functions to select the target group. +If NULL is given to those functions, the latest open group is +selected. + +For example, you can do something like the following:: + + int my_midlayer_create_something() + { + if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL)) + return -ENOMEM; + + ... + + devres_close_group(dev, my_midlayer_create_something); + return 0; + } + + void my_midlayer_destroy_something() + { + devres_release_group(dev, my_midlayer_create_something); + } + + +4. Details +---------- + +Lifetime of a devres entry begins on devres allocation and finishes +when it is released or destroyed (removed and freed) - no reference +counting. + +devres core guarantees atomicity to all basic devres operations and +has support for single-instance devres types (atomic +lookup-and-add-if-not-found). Other than that, synchronizing +concurrent accesses to allocated devres data is caller's +responsibility. This is usually non-issue because bus ops and +resource allocations already do the job. + +For an example of single-instance devres type, read pcim_iomap_table() +in lib/devres.c. + +All devres interface functions can be called without context if the +right gfp mask is given. + + +5. Overhead +----------- + +Each devres bookkeeping info is allocated together with requested data +area. With debug option turned off, bookkeeping info occupies 16 +bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded +up to ull alignment). If singly linked list is used, it can be +reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit). + +Each devres group occupies 8 pointers. It can be reduced to 6 if +singly linked list is used. + +Memory space overhead on ahci controller with two ports is between 300 +and 400 bytes on 32bit machine after naive conversion (we can +certainly invest a bit more effort into libata core layer). + + +6. List of managed interfaces +----------------------------- + +CLOCK + devm_clk_get() + devm_clk_get_optional() + devm_clk_put() + devm_clk_hw_register() + devm_of_clk_add_hw_provider() + devm_clk_hw_register_clkdev() + +DMA + dmaenginem_async_device_register() + dmam_alloc_coherent() + dmam_alloc_attrs() + dmam_free_coherent() + dmam_pool_create() + dmam_pool_destroy() + +DRM + devm_drm_dev_init() + +GPIO + devm_gpiod_get() + devm_gpiod_get_index() + devm_gpiod_get_index_optional() + devm_gpiod_get_optional() + devm_gpiod_put() + devm_gpiod_unhinge() + devm_gpiochip_add_data() + devm_gpio_request() + devm_gpio_request_one() + devm_gpio_free() + +I2C + devm_i2c_new_dummy_device() + +IIO + devm_iio_device_alloc() + devm_iio_device_free() + devm_iio_device_register() + devm_iio_device_unregister() + devm_iio_kfifo_allocate() + devm_iio_kfifo_free() + devm_iio_triggered_buffer_setup() + devm_iio_triggered_buffer_cleanup() + devm_iio_trigger_alloc() + devm_iio_trigger_free() + devm_iio_trigger_register() + devm_iio_trigger_unregister() + devm_iio_channel_get() + devm_iio_channel_release() + devm_iio_channel_get_all() + devm_iio_channel_release_all() + +INPUT + devm_input_allocate_device() + +IO region + devm_release_mem_region() + devm_release_region() + devm_release_resource() + devm_request_mem_region() + devm_request_region() + devm_request_resource() + +IOMAP + devm_ioport_map() + devm_ioport_unmap() + devm_ioremap() + devm_ioremap_nocache() + devm_ioremap_wc() + devm_ioremap_resource() : checks resource, requests memory region, ioremaps + devm_iounmap() + pcim_iomap() + pcim_iomap_regions() : do request_region() and iomap() on multiple BARs + pcim_iomap_table() : array of mapped addresses indexed by BAR + pcim_iounmap() + +IRQ + devm_free_irq() + devm_request_any_context_irq() + devm_request_irq() + devm_request_threaded_irq() + devm_irq_alloc_descs() + devm_irq_alloc_desc() + devm_irq_alloc_desc_at() + devm_irq_alloc_desc_from() + devm_irq_alloc_descs_from() + devm_irq_alloc_generic_chip() + devm_irq_setup_generic_chip() + devm_irq_sim_init() + +LED + devm_led_classdev_register() + devm_led_classdev_unregister() + +MDIO + devm_mdiobus_alloc() + devm_mdiobus_alloc_size() + devm_mdiobus_free() + +MEM + devm_free_pages() + devm_get_free_pages() + devm_kasprintf() + devm_kcalloc() + devm_kfree() + devm_kmalloc() + devm_kmalloc_array() + devm_kmemdup() + devm_kstrdup() + devm_kvasprintf() + devm_kzalloc() + +MFD + devm_mfd_add_devices() + +MUX + devm_mux_chip_alloc() + devm_mux_chip_register() + devm_mux_control_get() + +PER-CPU MEM + devm_alloc_percpu() + devm_free_percpu() + +PCI + devm_pci_alloc_host_bridge() : managed PCI host bridge allocation + devm_pci_remap_cfgspace() : ioremap PCI configuration space + devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource + pcim_enable_device() : after success, all PCI ops become managed + pcim_pin_device() : keep PCI device enabled after release + +PHY + devm_usb_get_phy() + devm_usb_put_phy() + +PINCTRL + devm_pinctrl_get() + devm_pinctrl_put() + devm_pinctrl_register() + devm_pinctrl_unregister() + +POWER + devm_reboot_mode_register() + devm_reboot_mode_unregister() + +PWM + devm_pwm_get() + devm_pwm_put() + +REGULATOR + devm_regulator_bulk_get() + devm_regulator_get() + devm_regulator_put() + devm_regulator_register() + +RESET + devm_reset_control_get() + devm_reset_controller_register() + +SERDEV + devm_serdev_device_open() + +SLAVE DMA ENGINE + devm_acpi_dma_controller_register() + +SPI + devm_spi_register_master() + +WATCHDOG + devm_watchdog_register_device() diff --git a/Documentation/driver-api/driver-model/driver.rst b/Documentation/driver-api/driver-model/driver.rst new file mode 100644 index 000000000000..11d281506a04 --- /dev/null +++ b/Documentation/driver-api/driver-model/driver.rst @@ -0,0 +1,223 @@ +============== +Device Drivers +============== + +See the kerneldoc for the struct device_driver. + + +Allocation +~~~~~~~~~~ + +Device drivers are statically allocated structures. Though there may +be multiple devices in a system that a driver supports, struct +device_driver represents the driver as a whole (not a particular +device instance). + +Initialization +~~~~~~~~~~~~~~ + +The driver must initialize at least the name and bus fields. It should +also initialize the devclass field (when it arrives), so it may obtain +the proper linkage internally. It should also initialize as many of +the callbacks as possible, though each is optional. + +Declaration +~~~~~~~~~~~ + +As stated above, struct device_driver objects are statically +allocated. Below is an example declaration of the eepro100 +driver. This declaration is hypothetical only; it relies on the driver +being converted completely to the new model:: + + static struct device_driver eepro100_driver = { + .name = "eepro100", + .bus = &pci_bus_type, + + .probe = eepro100_probe, + .remove = eepro100_remove, + .suspend = eepro100_suspend, + .resume = eepro100_resume, + }; + +Most drivers will not be able to be converted completely to the new +model because the bus they belong to has a bus-specific structure with +bus-specific fields that cannot be generalized. + +The most common example of this are device ID structures. A driver +typically defines an array of device IDs that it supports. The format +of these structures and the semantics for comparing device IDs are +completely bus-specific. Defining them as bus-specific entities would +sacrifice type-safety, so we keep bus-specific structures around. + +Bus-specific drivers should include a generic struct device_driver in +the definition of the bus-specific driver. Like this:: + + struct pci_driver { + const struct pci_device_id *id_table; + struct device_driver driver; + }; + +A definition that included bus-specific fields would look like +(using the eepro100 driver again):: + + static struct pci_driver eepro100_driver = { + .id_table = eepro100_pci_tbl, + .driver = { + .name = "eepro100", + .bus = &pci_bus_type, + .probe = eepro100_probe, + .remove = eepro100_remove, + .suspend = eepro100_suspend, + .resume = eepro100_resume, + }, + }; + +Some may find the syntax of embedded struct initialization awkward or +even a bit ugly. So far, it's the best way we've found to do what we want... + +Registration +~~~~~~~~~~~~ + +:: + + int driver_register(struct device_driver *drv); + +The driver registers the structure on startup. For drivers that have +no bus-specific fields (i.e. don't have a bus-specific driver +structure), they would use driver_register and pass a pointer to their +struct device_driver object. + +Most drivers, however, will have a bus-specific structure and will +need to register with the bus using something like pci_driver_register. + +It is important that drivers register their driver structure as early as +possible. Registration with the core initializes several fields in the +struct device_driver object, including the reference count and the +lock. These fields are assumed to be valid at all times and may be +used by the device model core or the bus driver. + + +Transition Bus Drivers +~~~~~~~~~~~~~~~~~~~~~~ + +By defining wrapper functions, the transition to the new model can be +made easier. Drivers can ignore the generic structure altogether and +let the bus wrapper fill in the fields. For the callbacks, the bus can +define generic callbacks that forward the call to the bus-specific +callbacks of the drivers. + +This solution is intended to be only temporary. In order to get class +information in the driver, the drivers must be modified anyway. Since +converting drivers to the new model should reduce some infrastructural +complexity and code size, it is recommended that they are converted as +class information is added. + +Access +~~~~~~ + +Once the object has been registered, it may access the common fields of +the object, like the lock and the list of devices:: + + int driver_for_each_dev(struct device_driver *drv, void *data, + int (*callback)(struct device *dev, void *data)); + +The devices field is a list of all the devices that have been bound to +the driver. The LDM core provides a helper function to operate on all +the devices a driver controls. This helper locks the driver on each +node access, and does proper reference counting on each device as it +accesses it. + + +sysfs +~~~~~ + +When a driver is registered, a sysfs directory is created in its +bus's directory. In this directory, the driver can export an interface +to userspace to control operation of the driver on a global basis; +e.g. toggling debugging output in the driver. + +A future feature of this directory will be a 'devices' directory. This +directory will contain symlinks to the directories of devices it +supports. + + + +Callbacks +~~~~~~~~~ + +:: + + int (*probe) (struct device *dev); + +The probe() entry is called in task context, with the bus's rwsem locked +and the driver partially bound to the device. Drivers commonly use +container_of() to convert "dev" to a bus-specific type, both in probe() +and other routines. That type often provides device resource data, such +as pci_dev.resource[] or platform_device.resources, which is used in +addition to dev->platform_data to initialize the driver. + +This callback holds the driver-specific logic to bind the driver to a +given device. That includes verifying that the device is present, that +it's a version the driver can handle, that driver data structures can +be allocated and initialized, and that any hardware can be initialized. +Drivers often store a pointer to their state with dev_set_drvdata(). +When the driver has successfully bound itself to that device, then probe() +returns zero and the driver model code will finish its part of binding +the driver to that device. + +A driver's probe() may return a negative errno value to indicate that +the driver did not bind to this device, in which case it should have +released all resources it allocated:: + + int (*remove) (struct device *dev); + +remove is called to unbind a driver from a device. This may be +called if a device is physically removed from the system, if the +driver module is being unloaded, during a reboot sequence, or +in other cases. + +It is up to the driver to determine if the device is present or +not. It should free any resources allocated specifically for the +device; i.e. anything in the device's driver_data field. + +If the device is still present, it should quiesce the device and place +it into a supported low-power state:: + + int (*suspend) (struct device *dev, pm_message_t state); + +suspend is called to put the device in a low power state:: + + int (*resume) (struct device *dev); + +Resume is used to bring a device back from a low power state. + + +Attributes +~~~~~~~~~~ + +:: + + struct driver_attribute { + struct attribute attr; + ssize_t (*show)(struct device_driver *driver, char *buf); + ssize_t (*store)(struct device_driver *, const char *buf, size_t count); + }; + +Device drivers can export attributes via their sysfs directories. +Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO +macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO +macros. + +Example:: + + DRIVER_ATTR_RW(debug); + +This is equivalent to declaring:: + + struct driver_attribute driver_attr_debug; + +This can then be used to add and remove the attribute from the +driver's directory using:: + + int driver_create_file(struct device_driver *, const struct driver_attribute *); + void driver_remove_file(struct device_driver *, const struct driver_attribute *); diff --git a/Documentation/driver-api/driver-model/index.rst b/Documentation/driver-api/driver-model/index.rst new file mode 100644 index 000000000000..755016422269 --- /dev/null +++ b/Documentation/driver-api/driver-model/index.rst @@ -0,0 +1,24 @@ +============ +Driver Model +============ + +.. toctree:: + :maxdepth: 1 + + binding + bus + class + design-patterns + device + devres + driver + overview + platform + porting + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst new file mode 100644 index 000000000000..d4d1e9b40e0c --- /dev/null +++ b/Documentation/driver-api/driver-model/overview.rst @@ -0,0 +1,124 @@ +============================= +The Linux Kernel Device Model +============================= + +Patrick Mochel + +Drafted 26 August 2002 +Updated 31 January 2006 + + +Overview +~~~~~~~~ + +The Linux Kernel Driver Model is a unification of all the disparate driver +models that were previously used in the kernel. It is intended to augment the +bus-specific drivers for bridges and devices by consolidating a set of data +and operations into globally accessible data structures. + +Traditional driver models implemented some sort of tree-like structure +(sometimes just a list) for the devices they control. There wasn't any +uniformity across the different bus types. + +The current driver model provides a common, uniform data model for describing +a bus and the devices that can appear under the bus. The unified bus +model includes a set of common attributes which all busses carry, and a set +of common callbacks, such as device discovery during bus probing, bus +shutdown, bus power management, etc. + +The common device and bridge interface reflects the goals of the modern +computer: namely the ability to do seamless device "plug and play", power +management, and hot plug. In particular, the model dictated by Intel and +Microsoft (namely ACPI) ensures that almost every device on almost any bus +on an x86-compatible system can work within this paradigm. Of course, +not every bus is able to support all such operations, although most +buses support most of those operations. + + +Downstream Access +~~~~~~~~~~~~~~~~~ + +Common data fields have been moved out of individual bus layers into a common +data structure. These fields must still be accessed by the bus layers, +and sometimes by the device-specific drivers. + +Other bus layers are encouraged to do what has been done for the PCI layer. +struct pci_dev now looks like this:: + + struct pci_dev { + ... + + struct device dev; /* Generic device interface */ + ... + }; + +Note first that the struct device dev within the struct pci_dev is +statically allocated. This means only one allocation on device discovery. + +Note also that that struct device dev is not necessarily defined at the +front of the pci_dev structure. This is to make people think about what +they're doing when switching between the bus driver and the global driver, +and to discourage meaningless and incorrect casts between the two. + +The PCI bus layer freely accesses the fields of struct device. It knows about +the structure of struct pci_dev, and it should know the structure of struct +device. Individual PCI device drivers that have been converted to the current +driver model generally do not and should not touch the fields of struct device, +unless there is a compelling reason to do so. + +The above abstraction prevents unnecessary pain during transitional phases. +If it were not done this way, then when a field was renamed or removed, every +downstream driver would break. On the other hand, if only the bus layer +(and not the device layer) accesses the struct device, it is only the bus +layer that needs to change. + + +User Interface +~~~~~~~~~~~~~~ + +By virtue of having a complete hierarchical view of all the devices in the +system, exporting a complete hierarchical view to userspace becomes relatively +easy. This has been accomplished by implementing a special purpose virtual +file system named sysfs. + +Almost all mainstream Linux distros mount this filesystem automatically; you +can see some variation of the following in the output of the "mount" command:: + + $ mount + ... + none on /sys type sysfs (rw,noexec,nosuid,nodev) + ... + $ + +The auto-mounting of sysfs is typically accomplished by an entry similar to +the following in the /etc/fstab file:: + + none /sys sysfs defaults 0 0 + +or something similar in the /lib/init/fstab file on Debian-based systems:: + + none /sys sysfs nodev,noexec,nosuid 0 0 + +If sysfs is not automatically mounted, you can always do it manually with:: + + # mount -t sysfs sysfs /sys + +Whenever a device is inserted into the tree, a directory is created for it. +This directory may be populated at each layer of discovery - the global layer, +the bus layer, or the device layer. + +The global layer currently creates two files - 'name' and 'power'. The +former only reports the name of the device. The latter reports the +current power state of the device. It will also be used to set the current +power state. + +The bus layer may also create files for the devices it finds while probing the +bus. For example, the PCI layer currently creates 'irq' and 'resource' files +for each PCI device. + +A device-specific driver may also export files in its directory to expose +device-specific data or tunable interfaces. + +More information about the sysfs directory layout can be found in +the other documents in this directory and in the file +Documentation/filesystems/sysfs.txt. diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst new file mode 100644 index 000000000000..334dd4071ae4 --- /dev/null +++ b/Documentation/driver-api/driver-model/platform.rst @@ -0,0 +1,246 @@ +============================ +Platform Devices and Drivers +============================ + +See for the driver model interface to the +platform bus: platform_device, and platform_driver. This pseudo-bus +is used to connect devices on busses with minimal infrastructure, +like those used to integrate peripherals on many system-on-chip +processors, or some "legacy" PC interconnects; as opposed to large +formally specified ones like PCI or USB. + + +Platform devices +~~~~~~~~~~~~~~~~ +Platform devices are devices that typically appear as autonomous +entities in the system. This includes legacy port-based devices and +host bridges to peripheral buses, and most controllers integrated +into system-on-chip platforms. What they usually have in common +is direct addressing from a CPU bus. Rarely, a platform_device will +be connected through a segment of some other kind of bus; but its +registers will still be directly addressable. + +Platform devices are given a name, used in driver binding, and a +list of resources such as addresses and IRQs:: + + struct platform_device { + const char *name; + u32 id; + struct device dev; + u32 num_resources; + struct resource *resource; + }; + + +Platform drivers +~~~~~~~~~~~~~~~~ +Platform drivers follow the standard driver model convention, where +discovery/enumeration is handled outside the drivers, and drivers +provide probe() and remove() methods. They support power management +and shutdown notifications using the standard conventions:: + + struct platform_driver { + int (*probe)(struct platform_device *); + int (*remove)(struct platform_device *); + void (*shutdown)(struct platform_device *); + int (*suspend)(struct platform_device *, pm_message_t state); + int (*suspend_late)(struct platform_device *, pm_message_t state); + int (*resume_early)(struct platform_device *); + int (*resume)(struct platform_device *); + struct device_driver driver; + }; + +Note that probe() should in general verify that the specified device hardware +actually exists; sometimes platform setup code can't be sure. The probing +can use device resources, including clocks, and device platform_data. + +Platform drivers register themselves the normal way:: + + int platform_driver_register(struct platform_driver *drv); + +Or, in common situations where the device is known not to be hot-pluggable, +the probe() routine can live in an init section to reduce the driver's +runtime memory footprint:: + + int platform_driver_probe(struct platform_driver *drv, + int (*probe)(struct platform_device *)) + +Kernel modules can be composed of several platform drivers. The platform core +provides helpers to register and unregister an array of drivers:: + + int __platform_register_drivers(struct platform_driver * const *drivers, + unsigned int count, struct module *owner); + void platform_unregister_drivers(struct platform_driver * const *drivers, + unsigned int count); + +If one of the drivers fails to register, all drivers registered up to that +point will be unregistered in reverse order. Note that there is a convenience +macro that passes THIS_MODULE as owner parameter:: + + #define platform_register_drivers(drivers, count) + + +Device Enumeration +~~~~~~~~~~~~~~~~~~ +As a rule, platform specific (and often board-specific) setup code will +register platform devices:: + + int platform_device_register(struct platform_device *pdev); + + int platform_add_devices(struct platform_device **pdevs, int ndev); + +The general rule is to register only those devices that actually exist, +but in some cases extra devices might be registered. For example, a kernel +might be configured to work with an external network adapter that might not +be populated on all boards, or likewise to work with an integrated controller +that some boards might not hook up to any peripherals. + +In some cases, boot firmware will export tables describing the devices +that are populated on a given board. Without such tables, often the +only way for system setup code to set up the correct devices is to build +a kernel for a specific target board. Such board-specific kernels are +common with embedded and custom systems development. + +In many cases, the memory and IRQ resources associated with the platform +device are not enough to let the device's driver work. Board setup code +will often provide additional information using the device's platform_data +field to hold additional information. + +Embedded systems frequently need one or more clocks for platform devices, +which are normally kept off until they're actively needed (to save power). +System setup also associates those clocks with the device, so that that +calls to clk_get(&pdev->dev, clock_name) return them as needed. + + +Legacy Drivers: Device Probing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Some drivers are not fully converted to the driver model, because they take +on a non-driver role: the driver registers its platform device, rather than +leaving that for system infrastructure. Such drivers can't be hotplugged +or coldplugged, since those mechanisms require device creation to be in a +different system component than the driver. + +The only "good" reason for this is to handle older system designs which, like +original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware +configuration. Newer systems have largely abandoned that model, in favor of +bus-level support for dynamic configuration (PCI, USB), or device tables +provided by the boot firmware (e.g. PNPACPI on x86). There are too many +conflicting options about what might be where, and even educated guesses by +an operating system will be wrong often enough to make trouble. + +This style of driver is discouraged. If you're updating such a driver, +please try to move the device enumeration to a more appropriate location, +outside the driver. This will usually be cleanup, since such drivers +tend to already have "normal" modes, such as ones using device nodes that +were created by PNP or by platform device setup. + +None the less, there are some APIs to support such legacy drivers. Avoid +using these calls except with such hotplug-deficient drivers:: + + struct platform_device *platform_device_alloc( + const char *name, int id); + +You can use platform_device_alloc() to dynamically allocate a device, which +you will then initialize with resources and platform_device_register(). +A better solution is usually:: + + struct platform_device *platform_device_register_simple( + const char *name, int id, + struct resource *res, unsigned int nres); + +You can use platform_device_register_simple() as a one-step call to allocate +and register a device. + + +Device Naming and Driver Binding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The platform_device.dev.bus_id is the canonical name for the devices. +It's built from two components: + + * platform_device.name ... which is also used to for driver matching. + + * platform_device.id ... the device instance number, or else "-1" + to indicate there's only one. + +These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and +"serial/3" indicates bus_id "serial.3"; both would use the platform_driver +named "serial". While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id) +and use the platform_driver called "my_rtc". + +Driver binding is performed automatically by the driver core, invoking +driver probe() after finding a match between device and driver. If the +probe() succeeds, the driver and device are bound as usual. There are +three different ways to find such a match: + + - Whenever a device is registered, the drivers for that bus are + checked for matches. Platform devices should be registered very + early during system boot. + + - When a driver is registered using platform_driver_register(), all + unbound devices on that bus are checked for matches. Drivers + usually register later during booting, or by module loading. + + - Registering a driver using platform_driver_probe() works just like + using platform_driver_register(), except that the driver won't + be probed later if another device registers. (Which is OK, since + this interface is only for use with non-hotpluggable devices.) + + +Early Platform Devices and Drivers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The early platform interfaces provide platform data to platform device +drivers early on during the system boot. The code is built on top of the +early_param() command line parsing and can be executed very early on. + +Example: "earlyprintk" class early serial console in 6 steps + +1. Registering early platform device data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code registers platform device data using the function +early_platform_add_devices(). In the case of early serial console this +should be hardware configuration for the serial port. Devices registered +at this point will later on be matched against early platform drivers. + +2. Parsing kernel command line +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code calls parse_early_param() to parse the kernel +command line. This will execute all matching early_param() callbacks. +User specified early platform devices will be registered at this point. +For the early serial console case the user can specify port on the +kernel command line as "earlyprintk=serial.0" where "earlyprintk" is +the class string, "serial" is the name of the platform driver and +0 is the platform device id. If the id is -1 then the dot and the +id can be omitted. + +3. Installing early platform drivers belonging to a certain class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code may optionally force registration of all early +platform drivers belonging to a certain class using the function +early_platform_driver_register_all(). User specified devices from +step 2 have priority over these. This step is omitted by the serial +driver example since the early serial driver code should be disabled +unless the user has specified port on the kernel command line. + +4. Early platform driver registration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Compiled-in platform drivers making use of early_platform_init() are +automatically registered during step 2 or 3. The serial driver example +should use early_platform_init("earlyprintk", &platform_driver). + +5. Probing of early platform drivers belonging to a certain class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The architecture code calls early_platform_driver_probe() to match +registered early platform devices associated with a certain class with +registered early platform drivers. Matched devices will get probed(). +This step can be executed at any point during the early boot. As soon +as possible may be good for the serial port case. + +6. Inside the early platform driver probe() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The driver code needs to take special care during early boot, especially +when it comes to memory allocation and interrupt registration. The code +in the probe() function can use is_early_platform_device() to check if +it is called at early platform device or at the regular platform device +time. The early serial driver performs register_console() at this point. + +For further information, see . diff --git a/Documentation/driver-api/driver-model/porting.rst b/Documentation/driver-api/driver-model/porting.rst new file mode 100644 index 000000000000..931ea879af3f --- /dev/null +++ b/Documentation/driver-api/driver-model/porting.rst @@ -0,0 +1,448 @@ +======================================= +Porting Drivers to the New Driver Model +======================================= + +Patrick Mochel + +7 January 2003 + + +Overview + +Please refer to `Documentation/driver-api/driver-model/*.rst` for definitions of +various driver types and concepts. + +Most of the work of porting devices drivers to the new model happens +at the bus driver layer. This was intentional, to minimize the +negative effect on kernel drivers, and to allow a gradual transition +of bus drivers. + +In a nutshell, the driver model consists of a set of objects that can +be embedded in larger, bus-specific objects. Fields in these generic +objects can replace fields in the bus-specific objects. + +The generic objects must be registered with the driver model core. By +doing so, they will exported via the sysfs filesystem. sysfs can be +mounted by doing:: + + # mount -t sysfs sysfs /sys + + + +The Process + +Step 0: Read include/linux/device.h for object and function definitions. + +Step 1: Registering the bus driver. + + +- Define a struct bus_type for the bus driver:: + + struct bus_type pci_bus_type = { + .name = "pci", + }; + + +- Register the bus type. + + This should be done in the initialization function for the bus type, + which is usually the module_init(), or equivalent, function:: + + static int __init pci_driver_init(void) + { + return bus_register(&pci_bus_type); + } + + subsys_initcall(pci_driver_init); + + + The bus type may be unregistered (if the bus driver may be compiled + as a module) by doing:: + + bus_unregister(&pci_bus_type); + + +- Export the bus type for others to use. + + Other code may wish to reference the bus type, so declare it in a + shared header file and export the symbol. + +From include/linux/pci.h:: + + extern struct bus_type pci_bus_type; + + +From file the above code appears in:: + + EXPORT_SYMBOL(pci_bus_type); + + + +- This will cause the bus to show up in /sys/bus/pci/ with two + subdirectories: 'devices' and 'drivers':: + + # tree -d /sys/bus/pci/ + /sys/bus/pci/ + |-- devices + `-- drivers + + + +Step 2: Registering Devices. + +struct device represents a single device. It mainly contains metadata +describing the relationship the device has to other entities. + + +- Embed a struct device in the bus-specific device type:: + + + struct pci_dev { + ... + struct device dev; /* Generic device interface */ + ... + }; + + It is recommended that the generic device not be the first item in + the struct to discourage programmers from doing mindless casts + between the object types. Instead macros, or inline functions, + should be created to convert from the generic object type:: + + + #define to_pci_dev(n) container_of(n, struct pci_dev, dev) + + or + + static inline struct pci_dev * to_pci_dev(struct kobject * kobj) + { + return container_of(n, struct pci_dev, dev); + } + + This allows the compiler to verify type-safety of the operations + that are performed (which is Good). + + +- Initialize the device on registration. + + When devices are discovered or registered with the bus type, the + bus driver should initialize the generic device. The most important + things to initialize are the bus_id, parent, and bus fields. + + The bus_id is an ASCII string that contains the device's address on + the bus. The format of this string is bus-specific. This is + necessary for representing devices in sysfs. + + parent is the physical parent of the device. It is important that + the bus driver sets this field correctly. + + The driver model maintains an ordered list of devices that it uses + for power management. This list must be in order to guarantee that + devices are shutdown before their physical parents, and vice versa. + The order of this list is determined by the parent of registered + devices. + + Also, the location of the device's sysfs directory depends on a + device's parent. sysfs exports a directory structure that mirrors + the device hierarchy. Accurately setting the parent guarantees that + sysfs will accurately represent the hierarchy. + + The device's bus field is a pointer to the bus type the device + belongs to. This should be set to the bus_type that was declared + and initialized before. + + Optionally, the bus driver may set the device's name and release + fields. + + The name field is an ASCII string describing the device, like + + "ATI Technologies Inc Radeon QD" + + The release field is a callback that the driver model core calls + when the device has been removed, and all references to it have + been released. More on this in a moment. + + +- Register the device. + + Once the generic device has been initialized, it can be registered + with the driver model core by doing:: + + device_register(&dev->dev); + + It can later be unregistered by doing:: + + device_unregister(&dev->dev); + + This should happen on buses that support hotpluggable devices. + If a bus driver unregisters a device, it should not immediately free + it. It should instead wait for the driver model core to call the + device's release method, then free the bus-specific object. + (There may be other code that is currently referencing the device + structure, and it would be rude to free the device while that is + happening). + + + When the device is registered, a directory in sysfs is created. + The PCI tree in sysfs looks like:: + + /sys/devices/pci0/ + |-- 00:00.0 + |-- 00:01.0 + | `-- 01:00.0 + |-- 00:02.0 + | `-- 02:1f.0 + | `-- 03:00.0 + |-- 00:1e.0 + | `-- 04:04.0 + |-- 00:1f.0 + |-- 00:1f.1 + | |-- ide0 + | | |-- 0.0 + | | `-- 0.1 + | `-- ide1 + | `-- 1.0 + |-- 00:1f.2 + |-- 00:1f.3 + `-- 00:1f.5 + + Also, symlinks are created in the bus's 'devices' directory + that point to the device's directory in the physical hierarchy:: + + /sys/bus/pci/devices/ + |-- 00:00.0 -> ../../../devices/pci0/00:00.0 + |-- 00:01.0 -> ../../../devices/pci0/00:01.0 + |-- 00:02.0 -> ../../../devices/pci0/00:02.0 + |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0 + |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0 + |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1 + |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2 + |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3 + |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5 + |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0 + |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0 + |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0 + `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0 + + + +Step 3: Registering Drivers. + +struct device_driver is a simple driver structure that contains a set +of operations that the driver model core may call. + + +- Embed a struct device_driver in the bus-specific driver. + + Just like with devices, do something like:: + + struct pci_driver { + ... + struct device_driver driver; + }; + + +- Initialize the generic driver structure. + + When the driver registers with the bus (e.g. doing pci_register_driver()), + initialize the necessary fields of the driver: the name and bus + fields. + + +- Register the driver. + + After the generic driver has been initialized, call:: + + driver_register(&drv->driver); + + to register the driver with the core. + + When the driver is unregistered from the bus, unregister it from the + core by doing:: + + driver_unregister(&drv->driver); + + Note that this will block until all references to the driver have + gone away. Normally, there will not be any. + + +- Sysfs representation. + + Drivers are exported via sysfs in their bus's 'driver's directory. + For example:: + + /sys/bus/pci/drivers/ + |-- 3c59x + |-- Ensoniq AudioPCI + |-- agpgart-amdk7 + |-- e100 + `-- serial + + +Step 4: Define Generic Methods for Drivers. + +struct device_driver defines a set of operations that the driver model +core calls. Most of these operations are probably similar to +operations the bus already defines for drivers, but taking different +parameters. + +It would be difficult and tedious to force every driver on a bus to +simultaneously convert their drivers to generic format. Instead, the +bus driver should define single instances of the generic methods that +forward call to the bus-specific drivers. For instance:: + + + static int pci_device_remove(struct device * dev) + { + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + + if (drv) { + if (drv->remove) + drv->remove(pci_dev); + pci_dev->driver = NULL; + } + return 0; + } + + +The generic driver should be initialized with these methods before it +is registered:: + + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &pci_bus_type; + drv->driver.probe = pci_device_probe; + drv->driver.resume = pci_device_resume; + drv->driver.suspend = pci_device_suspend; + drv->driver.remove = pci_device_remove; + + /* register with core */ + driver_register(&drv->driver); + + +Ideally, the bus should only initialize the fields if they are not +already set. This allows the drivers to implement their own generic +methods. + + +Step 5: Support generic driver binding. + +The model assumes that a device or driver can be dynamically +registered with the bus at any time. When registration happens, +devices must be bound to a driver, or drivers must be bound to all +devices that it supports. + +A driver typically contains a list of device IDs that it supports. The +bus driver compares these IDs to the IDs of devices registered with it. +The format of the device IDs, and the semantics for comparing them are +bus-specific, so the generic model does attempt to generalize them. + +Instead, a bus may supply a method in struct bus_type that does the +comparison:: + + int (*match)(struct device * dev, struct device_driver * drv); + +match should return positive value if the driver supports the device, +and zero otherwise. It may also return error code (for example +-EPROBE_DEFER) if determining that given driver supports the device is +not possible. + +When a device is registered, the bus's list of drivers is iterated +over. bus->match() is called for each one until a match is found. + +When a driver is registered, the bus's list of devices is iterated +over. bus->match() is called for each device that is not already +claimed by a driver. + +When a device is successfully bound to a driver, device->driver is +set, the device is added to a per-driver list of devices, and a +symlink is created in the driver's sysfs directory that points to the +device's physical directory:: + + /sys/bus/pci/drivers/ + |-- 3c59x + | `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0 + |-- Ensoniq AudioPCI + |-- agpgart-amdk7 + | `-- 00:00.0 -> ../../../../devices/pci0/00:00.0 + |-- e100 + | `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0 + `-- serial + + +This driver binding should replace the existing driver binding +mechanism the bus currently uses. + + +Step 6: Supply a hotplug callback. + +Whenever a device is registered with the driver model core, the +userspace program /sbin/hotplug is called to notify userspace. +Users can define actions to perform when a device is inserted or +removed. + +The driver model core passes several arguments to userspace via +environment variables, including + +- ACTION: set to 'add' or 'remove' +- DEVPATH: set to the device's physical path in sysfs. + +A bus driver may also supply additional parameters for userspace to +consume. To do this, a bus must implement the 'hotplug' method in +struct bus_type:: + + int (*hotplug) (struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); + +This is called immediately before /sbin/hotplug is executed. + + +Step 7: Cleaning up the bus driver. + +The generic bus, device, and driver structures provide several fields +that can replace those defined privately to the bus driver. + +- Device list. + +struct bus_type contains a list of all devices registered with the bus +type. This includes all devices on all instances of that bus type. +An internal list that the bus uses may be removed, in favor of using +this one. + +The core provides an iterator to access these devices:: + + int bus_for_each_dev(struct bus_type * bus, struct device * start, + void * data, int (*fn)(struct device *, void *)); + + +- Driver list. + +struct bus_type also contains a list of all drivers registered with +it. An internal list of drivers that the bus driver maintains may +be removed in favor of using the generic one. + +The drivers may be iterated over, like devices:: + + int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, + void * data, int (*fn)(struct device_driver *, void *)); + + +Please see drivers/base/bus.c for more information. + + +- rwsem + +struct bus_type contains an rwsem that protects all core accesses to +the device and driver lists. This can be used by the bus driver +internally, and should be used when accessing the device or driver +lists the bus maintains. + + +- Device and driver fields. + +Some of the fields in struct device and struct device_driver duplicate +fields in the bus-specific representations of these objects. Feel free +to remove the bus-specific ones and favor the generic ones. Note +though, that this will likely mean fixing up all the drivers that +reference the bus-specific fields (though those should all be 1-line +changes). diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 349f2dc33029..921c71a3d683 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -399,7 +399,7 @@ symbol: will pass the struct gpio_chip* for the chip to all IRQ callbacks, so the callbacks need to embed the gpio_chip in its state container and obtain a pointer to the container using container_of(). - (See Documentation/driver-model/design-patterns.rst) + (See Documentation/driver-api/driver-model/design-patterns.rst) - gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip, as discussed above regarding different types of cascaded irqchips. The diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index b4c993ff7655..9fb03b7bdeb1 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -14,6 +14,7 @@ available subsections can be seen below. .. toctree:: :maxdepth: 2 + driver-model/index basics infrastructure early-userspace/index diff --git a/Documentation/driver-model/binding.rst b/Documentation/driver-model/binding.rst deleted file mode 100644 index 7ea1d7a41e1d..000000000000 --- a/Documentation/driver-model/binding.rst +++ /dev/null @@ -1,98 +0,0 @@ -============== -Driver Binding -============== - -Driver binding is the process of associating a device with a device -driver that can control it. Bus drivers have typically handled this -because there have been bus-specific structures to represent the -devices and the drivers. With generic device and device driver -structures, most of the binding can take place using common code. - - -Bus -~~~ - -The bus type structure contains a list of all devices that are on that bus -type in the system. When device_register is called for a device, it is -inserted into the end of this list. The bus object also contains a -list of all drivers of that bus type. When driver_register is called -for a driver, it is inserted at the end of this list. These are the -two events which trigger driver binding. - - -device_register -~~~~~~~~~~~~~~~ - -When a new device is added, the bus's list of drivers is iterated over -to find one that supports it. In order to determine that, the device -ID of the device must match one of the device IDs that the driver -supports. The format and semantics for comparing IDs is bus-specific. -Instead of trying to derive a complex state machine and matching -algorithm, it is up to the bus driver to provide a callback to compare -a device against the IDs of a driver. The bus returns 1 if a match was -found; 0 otherwise. - -int match(struct device * dev, struct device_driver * drv); - -If a match is found, the device's driver field is set to the driver -and the driver's probe callback is called. This gives the driver a -chance to verify that it really does support the hardware, and that -it's in a working state. - -Device Class -~~~~~~~~~~~~ - -Upon the successful completion of probe, the device is registered with -the class to which it belongs. Device drivers belong to one and only one -class, and that is set in the driver's devclass field. -devclass_add_device is called to enumerate the device within the class -and actually register it with the class, which happens with the -class's register_dev callback. - - -Driver -~~~~~~ - -When a driver is attached to a device, the device is inserted into the -driver's list of devices. - - -sysfs -~~~~~ - -A symlink is created in the bus's 'devices' directory that points to -the device's directory in the physical hierarchy. - -A symlink is created in the driver's 'devices' directory that points -to the device's directory in the physical hierarchy. - -A directory for the device is created in the class's directory. A -symlink is created in that directory that points to the device's -physical location in the sysfs tree. - -A symlink can be created (though this isn't done yet) in the device's -physical directory to either its class directory, or the class's -top-level directory. One can also be created to point to its driver's -directory also. - - -driver_register -~~~~~~~~~~~~~~~ - -The process is almost identical for when a new driver is added. -The bus's list of devices is iterated over to find a match. Devices -that already have a driver are skipped. All the devices are iterated -over, to bind as many devices as possible to the driver. - - -Removal -~~~~~~~ - -When a device is removed, the reference count for it will eventually -go to 0. When it does, the remove callback of the driver is called. It -is removed from the driver's list of devices and the reference count -of the driver is decremented. All symlinks between the two are removed. - -When a driver is removed, the list of devices that it supports is -iterated over, and the driver's remove callback is called for each -one. The device is removed from that list and the symlinks removed. diff --git a/Documentation/driver-model/bus.rst b/Documentation/driver-model/bus.rst deleted file mode 100644 index 016b15a6e8ea..000000000000 --- a/Documentation/driver-model/bus.rst +++ /dev/null @@ -1,146 +0,0 @@ -========= -Bus Types -========= - -Definition -~~~~~~~~~~ -See the kerneldoc for the struct bus_type. - -int bus_register(struct bus_type * bus); - - -Declaration -~~~~~~~~~~~ - -Each bus type in the kernel (PCI, USB, etc) should declare one static -object of this type. They must initialize the name field, and may -optionally initialize the match callback:: - - struct bus_type pci_bus_type = { - .name = "pci", - .match = pci_bus_match, - }; - -The structure should be exported to drivers in a header file: - -extern struct bus_type pci_bus_type; - - -Registration -~~~~~~~~~~~~ - -When a bus driver is initialized, it calls bus_register. This -initializes the rest of the fields in the bus object and inserts it -into a global list of bus types. Once the bus object is registered, -the fields in it are usable by the bus driver. - - -Callbacks -~~~~~~~~~ - -match(): Attaching Drivers to Devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The format of device ID structures and the semantics for comparing -them are inherently bus-specific. Drivers typically declare an array -of device IDs of devices they support that reside in a bus-specific -driver structure. - -The purpose of the match callback is to give the bus an opportunity to -determine if a particular driver supports a particular device by -comparing the device IDs the driver supports with the device ID of a -particular device, without sacrificing bus-specific functionality or -type-safety. - -When a driver is registered with the bus, the bus's list of devices is -iterated over, and the match callback is called for each device that -does not have a driver associated with it. - - - -Device and Driver Lists -~~~~~~~~~~~~~~~~~~~~~~~ - -The lists of devices and drivers are intended to replace the local -lists that many buses keep. They are lists of struct devices and -struct device_drivers, respectively. Bus drivers are free to use the -lists as they please, but conversion to the bus-specific type may be -necessary. - -The LDM core provides helper functions for iterating over each list:: - - int bus_for_each_dev(struct bus_type * bus, struct device * start, - void * data, - int (*fn)(struct device *, void *)); - - int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, - void * data, int (*fn)(struct device_driver *, void *)); - -These helpers iterate over the respective list, and call the callback -for each device or driver in the list. All list accesses are -synchronized by taking the bus's lock (read currently). The reference -count on each object in the list is incremented before the callback is -called; it is decremented after the next object has been obtained. The -lock is not held when calling the callback. - - -sysfs -~~~~~~~~ -There is a top-level directory named 'bus'. - -Each bus gets a directory in the bus directory, along with two default -directories:: - - /sys/bus/pci/ - |-- devices - `-- drivers - -Drivers registered with the bus get a directory in the bus's drivers -directory:: - - /sys/bus/pci/ - |-- devices - `-- drivers - |-- Intel ICH - |-- Intel ICH Joystick - |-- agpgart - `-- e100 - -Each device that is discovered on a bus of that type gets a symlink in -the bus's devices directory to the device's directory in the physical -hierarchy:: - - /sys/bus/pci/ - |-- devices - | |-- 00:00.0 -> ../../../root/pci0/00:00.0 - | |-- 00:01.0 -> ../../../root/pci0/00:01.0 - | `-- 00:02.0 -> ../../../root/pci0/00:02.0 - `-- drivers - - -Exporting Attributes -~~~~~~~~~~~~~~~~~~~~ - -:: - - struct bus_attribute { - struct attribute attr; - ssize_t (*show)(struct bus_type *, char * buf); - ssize_t (*store)(struct bus_type *, const char * buf, size_t count); - }; - -Bus drivers can export attributes using the BUS_ATTR_RW macro that works -similarly to the DEVICE_ATTR_RW macro for devices. For example, a -definition like this:: - - static BUS_ATTR_RW(debug); - -is equivalent to declaring:: - - static bus_attribute bus_attr_debug; - -This can then be used to add and remove the attribute from the bus's -sysfs directory using:: - - int bus_create_file(struct bus_type *, struct bus_attribute *); - void bus_remove_file(struct bus_type *, struct bus_attribute *); diff --git a/Documentation/driver-model/class.rst b/Documentation/driver-model/class.rst deleted file mode 100644 index fff55b80e86a..000000000000 --- a/Documentation/driver-model/class.rst +++ /dev/null @@ -1,149 +0,0 @@ -============== -Device Classes -============== - -Introduction -~~~~~~~~~~~~ -A device class describes a type of device, like an audio or network -device. The following device classes have been identified: - - - - -Each device class defines a set of semantics and a programming interface -that devices of that class adhere to. Device drivers are the -implementation of that programming interface for a particular device on -a particular bus. - -Device classes are agnostic with respect to what bus a device resides -on. - - -Programming Interface -~~~~~~~~~~~~~~~~~~~~~ -The device class structure looks like:: - - - typedef int (*devclass_add)(struct device *); - typedef void (*devclass_remove)(struct device *); - -See the kerneldoc for the struct class. - -A typical device class definition would look like:: - - struct device_class input_devclass = { - .name = "input", - .add_device = input_add_device, - .remove_device = input_remove_device, - }; - -Each device class structure should be exported in a header file so it -can be used by drivers, extensions and interfaces. - -Device classes are registered and unregistered with the core using:: - - int devclass_register(struct device_class * cls); - void devclass_unregister(struct device_class * cls); - - -Devices -~~~~~~~ -As devices are bound to drivers, they are added to the device class -that the driver belongs to. Before the driver model core, this would -typically happen during the driver's probe() callback, once the device -has been initialized. It now happens after the probe() callback -finishes from the core. - -The device is enumerated in the class. Each time a device is added to -the class, the class's devnum field is incremented and assigned to the -device. The field is never decremented, so if the device is removed -from the class and re-added, it will receive a different enumerated -value. - -The class is allowed to create a class-specific structure for the -device and store it in the device's class_data pointer. - -There is no list of devices in the device class. Each driver has a -list of devices that it supports. The device class has a list of -drivers of that particular class. To access all of the devices in the -class, iterate over the device lists of each driver in the class. - - -Device Drivers -~~~~~~~~~~~~~~ -Device drivers are added to device classes when they are registered -with the core. A driver specifies the class it belongs to by setting -the struct device_driver::devclass field. - - -sysfs directory structure -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There is a top-level sysfs directory named 'class'. - -Each class gets a directory in the class directory, along with two -default subdirectories:: - - class/ - `-- input - |-- devices - `-- drivers - - -Drivers registered with the class get a symlink in the drivers/ directory -that points to the driver's directory (under its bus directory):: - - class/ - `-- input - |-- devices - `-- drivers - `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/ - - -Each device gets a symlink in the devices/ directory that points to the -device's directory in the physical hierarchy:: - - class/ - `-- input - |-- devices - | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/ - `-- drivers - - -Exporting Attributes -~~~~~~~~~~~~~~~~~~~~ - -:: - - struct devclass_attribute { - struct attribute attr; - ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off); - ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off); - }; - -Class drivers can export attributes using the DEVCLASS_ATTR macro that works -similarly to the DEVICE_ATTR macro for devices. For example, a definition -like this:: - - static DEVCLASS_ATTR(debug,0644,show_debug,store_debug); - -is equivalent to declaring:: - - static devclass_attribute devclass_attr_debug; - -The bus driver can add and remove the attribute from the class's -sysfs directory using:: - - int devclass_create_file(struct device_class *, struct devclass_attribute *); - void devclass_remove_file(struct device_class *, struct devclass_attribute *); - -In the example above, the file will be named 'debug' in placed in the -class's directory in sysfs. - - -Interfaces -~~~~~~~~~~ -There may exist multiple mechanisms for accessing the same device of a -particular class type. Device interfaces describe these mechanisms. - -When a device is added to a device class, the core attempts to add it -to every interface that is registered with the device class. diff --git a/Documentation/driver-model/design-patterns.rst b/Documentation/driver-model/design-patterns.rst deleted file mode 100644 index 41eb8f41f7dd..000000000000 --- a/Documentation/driver-model/design-patterns.rst +++ /dev/null @@ -1,116 +0,0 @@ -============================= -Device Driver Design Patterns -============================= - -This document describes a few common design patterns found in device drivers. -It is likely that subsystem maintainers will ask driver developers to -conform to these design patterns. - -1. State Container -2. container_of() - - -1. State Container -~~~~~~~~~~~~~~~~~~ - -While the kernel contains a few device drivers that assume that they will -only be probed() once on a certain system (singletons), it is custom to assume -that the device the driver binds to will appear in several instances. This -means that the probe() function and all callbacks need to be reentrant. - -The most common way to achieve this is to use the state container design -pattern. It usually has this form:: - - struct foo { - spinlock_t lock; /* Example member */ - (...) - }; - - static int foo_probe(...) - { - struct foo *foo; - - foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL); - if (!foo) - return -ENOMEM; - spin_lock_init(&foo->lock); - (...) - } - -This will create an instance of struct foo in memory every time probe() is -called. This is our state container for this instance of the device driver. -Of course it is then necessary to always pass this instance of the -state around to all functions that need access to the state and its members. - -For example, if the driver is registering an interrupt handler, you would -pass around a pointer to struct foo like this:: - - static irqreturn_t foo_handler(int irq, void *arg) - { - struct foo *foo = arg; - (...) - } - - static int foo_probe(...) - { - struct foo *foo; - - (...) - ret = request_irq(irq, foo_handler, 0, "foo", foo); - } - -This way you always get a pointer back to the correct instance of foo in -your interrupt handler. - - -2. container_of() -~~~~~~~~~~~~~~~~~ - -Continuing on the above example we add an offloaded work:: - - struct foo { - spinlock_t lock; - struct workqueue_struct *wq; - struct work_struct offload; - (...) - }; - - static void foo_work(struct work_struct *work) - { - struct foo *foo = container_of(work, struct foo, offload); - - (...) - } - - static irqreturn_t foo_handler(int irq, void *arg) - { - struct foo *foo = arg; - - queue_work(foo->wq, &foo->offload); - (...) - } - - static int foo_probe(...) - { - struct foo *foo; - - foo->wq = create_singlethread_workqueue("foo-wq"); - INIT_WORK(&foo->offload, foo_work); - (...) - } - -The design pattern is the same for an hrtimer or something similar that will -return a single argument which is a pointer to a struct member in the -callback. - -container_of() is a macro defined in - -What container_of() does is to obtain a pointer to the containing struct from -a pointer to a member by a simple subtraction using the offsetof() macro from -standard C, which allows something similar to object oriented behaviours. -Notice that the contained member must not be a pointer, but an actual member -for this to work. - -We can see here that we avoid having global pointers to our struct foo * -instance this way, while still keeping the number of parameters passed to the -work function to a single pointer. diff --git a/Documentation/driver-model/device.rst b/Documentation/driver-model/device.rst deleted file mode 100644 index 2b868d49d349..000000000000 --- a/Documentation/driver-model/device.rst +++ /dev/null @@ -1,109 +0,0 @@ -========================== -The Basic Device Structure -========================== - -See the kerneldoc for the struct device. - - -Programming Interface -~~~~~~~~~~~~~~~~~~~~~ -The bus driver that discovers the device uses this to register the -device with the core:: - - int device_register(struct device * dev); - -The bus should initialize the following fields: - - - parent - - name - - bus_id - - bus - -A device is removed from the core when its reference count goes to -0. The reference count can be adjusted using:: - - struct device * get_device(struct device * dev); - void put_device(struct device * dev); - -get_device() will return a pointer to the struct device passed to it -if the reference is not already 0 (if it's in the process of being -removed already). - -A driver can access the lock in the device structure using:: - - void lock_device(struct device * dev); - void unlock_device(struct device * dev); - - -Attributes -~~~~~~~~~~ - -:: - - struct device_attribute { - struct attribute attr; - ssize_t (*show)(struct device *dev, struct device_attribute *attr, - char *buf); - ssize_t (*store)(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count); - }; - -Attributes of devices can be exported by a device driver through sysfs. - -Please see Documentation/filesystems/sysfs.txt for more information -on how sysfs works. - -As explained in Documentation/kobject.txt, device attributes must be -created before the KOBJ_ADD uevent is generated. The only way to realize -that is by defining an attribute group. - -Attributes are declared using a macro called DEVICE_ATTR:: - - #define DEVICE_ATTR(name,mode,show,store) - -Example::: - - static DEVICE_ATTR(type, 0444, show_type, NULL); - static DEVICE_ATTR(power, 0644, show_power, store_power); - -This declares two structures of type struct device_attribute with respective -names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be -organized as follows into a group:: - - static struct attribute *dev_attrs[] = { - &dev_attr_type.attr, - &dev_attr_power.attr, - NULL, - }; - - static struct attribute_group dev_attr_group = { - .attrs = dev_attrs, - }; - - static const struct attribute_group *dev_attr_groups[] = { - &dev_attr_group, - NULL, - }; - -This array of groups can then be associated with a device by setting the -group pointer in struct device before device_register() is invoked:: - - dev->groups = dev_attr_groups; - device_register(dev); - -The device_register() function will use the 'groups' pointer to create the -device attributes and the device_unregister() function will use this pointer -to remove the device attributes. - -Word of warning: While the kernel allows device_create_file() and -device_remove_file() to be called on a device at any time, userspace has -strict expectations on when attributes get created. When a new device is -registered in the kernel, a uevent is generated to notify userspace (like -udev) that a new device is available. If attributes are added after the -device is registered, then userspace won't get notified and userspace will -not know about the new attributes. - -This is important for device driver that need to publish additional -attributes for a device at driver probe time. If the device driver simply -calls device_create_file() on the device structure passed to it, then -userspace will never be notified of the new attributes. diff --git a/Documentation/driver-model/devres.rst b/Documentation/driver-model/devres.rst deleted file mode 100644 index 4ac99122b5f1..000000000000 --- a/Documentation/driver-model/devres.rst +++ /dev/null @@ -1,414 +0,0 @@ -================================ -Devres - Managed Device Resource -================================ - -Tejun Heo - -First draft 10 January 2007 - -.. contents - - 1. Intro : Huh? Devres? - 2. Devres : Devres in a nutshell - 3. Devres Group : Group devres'es and release them together - 4. Details : Life time rules, calling context, ... - 5. Overhead : How much do we have to pay for this? - 6. List of managed interfaces: Currently implemented managed interfaces - - -1. Intro --------- - -devres came up while trying to convert libata to use iomap. Each -iomapped address should be kept and unmapped on driver detach. For -example, a plain SFF ATA controller (that is, good old PCI IDE) in -native mode makes use of 5 PCI BARs and all of them should be -maintained. - -As with many other device drivers, libata low level drivers have -sufficient bugs in ->remove and ->probe failure path. Well, yes, -that's probably because libata low level driver developers are lazy -bunch, but aren't all low level driver developers? After spending a -day fiddling with braindamaged hardware with no document or -braindamaged document, if it's finally working, well, it's working. - -For one reason or another, low level drivers don't receive as much -attention or testing as core code, and bugs on driver detach or -initialization failure don't happen often enough to be noticeable. -Init failure path is worse because it's much less travelled while -needs to handle multiple entry points. - -So, many low level drivers end up leaking resources on driver detach -and having half broken failure path implementation in ->probe() which -would leak resources or even cause oops when failure occurs. iomap -adds more to this mix. So do msi and msix. - - -2. Devres ---------- - -devres is basically linked list of arbitrarily sized memory areas -associated with a struct device. Each devres entry is associated with -a release function. A devres can be released in several ways. No -matter what, all devres entries are released on driver detach. On -release, the associated release function is invoked and then the -devres entry is freed. - -Managed interface is created for resources commonly used by device -drivers using devres. For example, coherent DMA memory is acquired -using dma_alloc_coherent(). The managed version is called -dmam_alloc_coherent(). It is identical to dma_alloc_coherent() except -for the DMA memory allocated using it is managed and will be -automatically released on driver detach. Implementation looks like -the following:: - - struct dma_devres { - size_t size; - void *vaddr; - dma_addr_t dma_handle; - }; - - static void dmam_coherent_release(struct device *dev, void *res) - { - struct dma_devres *this = res; - - dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle); - } - - dmam_alloc_coherent(dev, size, dma_handle, gfp) - { - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp); - ... - - /* alloc DMA memory as usual */ - vaddr = dma_alloc_coherent(...); - ... - - /* record size, vaddr, dma_handle in dr */ - dr->vaddr = vaddr; - ... - - devres_add(dev, dr); - - return vaddr; - } - -If a driver uses dmam_alloc_coherent(), the area is guaranteed to be -freed whether initialization fails half-way or the device gets -detached. If most resources are acquired using managed interface, a -driver can have much simpler init and exit code. Init path basically -looks like the following:: - - my_init_one() - { - struct mydev *d; - - d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL); - if (!d) - return -ENOMEM; - - d->ring = dmam_alloc_coherent(...); - if (!d->ring) - return -ENOMEM; - - if (check something) - return -EINVAL; - ... - - return register_to_upper_layer(d); - } - -And exit path:: - - my_remove_one() - { - unregister_from_upper_layer(d); - shutdown_my_hardware(); - } - -As shown above, low level drivers can be simplified a lot by using -devres. Complexity is shifted from less maintained low level drivers -to better maintained higher layer. Also, as init failure path is -shared with exit path, both can get more testing. - -Note though that when converting current calls or assignments to -managed devm_* versions it is up to you to check if internal operations -like allocating memory, have failed. Managed resources pertains to the -freeing of these resources *only* - all other checks needed are still -on you. In some cases this may mean introducing checks that were not -necessary before moving to the managed devm_* calls. - - -3. Devres group ---------------- - -Devres entries can be grouped using devres group. When a group is -released, all contained normal devres entries and properly nested -groups are released. One usage is to rollback series of acquired -resources on failure. For example:: - - if (!devres_open_group(dev, NULL, GFP_KERNEL)) - return -ENOMEM; - - acquire A; - if (failed) - goto err; - - acquire B; - if (failed) - goto err; - ... - - devres_remove_group(dev, NULL); - return 0; - - err: - devres_release_group(dev, NULL); - return err_code; - -As resource acquisition failure usually means probe failure, constructs -like above are usually useful in midlayer driver (e.g. libata core -layer) where interface function shouldn't have side effect on failure. -For LLDs, just returning error code suffices in most cases. - -Each group is identified by `void *id`. It can either be explicitly -specified by @id argument to devres_open_group() or automatically -created by passing NULL as @id as in the above example. In both -cases, devres_open_group() returns the group's id. The returned id -can be passed to other devres functions to select the target group. -If NULL is given to those functions, the latest open group is -selected. - -For example, you can do something like the following:: - - int my_midlayer_create_something() - { - if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL)) - return -ENOMEM; - - ... - - devres_close_group(dev, my_midlayer_create_something); - return 0; - } - - void my_midlayer_destroy_something() - { - devres_release_group(dev, my_midlayer_create_something); - } - - -4. Details ----------- - -Lifetime of a devres entry begins on devres allocation and finishes -when it is released or destroyed (removed and freed) - no reference -counting. - -devres core guarantees atomicity to all basic devres operations and -has support for single-instance devres types (atomic -lookup-and-add-if-not-found). Other than that, synchronizing -concurrent accesses to allocated devres data is caller's -responsibility. This is usually non-issue because bus ops and -resource allocations already do the job. - -For an example of single-instance devres type, read pcim_iomap_table() -in lib/devres.c. - -All devres interface functions can be called without context if the -right gfp mask is given. - - -5. Overhead ------------ - -Each devres bookkeeping info is allocated together with requested data -area. With debug option turned off, bookkeeping info occupies 16 -bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded -up to ull alignment). If singly linked list is used, it can be -reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit). - -Each devres group occupies 8 pointers. It can be reduced to 6 if -singly linked list is used. - -Memory space overhead on ahci controller with two ports is between 300 -and 400 bytes on 32bit machine after naive conversion (we can -certainly invest a bit more effort into libata core layer). - - -6. List of managed interfaces ------------------------------ - -CLOCK - devm_clk_get() - devm_clk_get_optional() - devm_clk_put() - devm_clk_hw_register() - devm_of_clk_add_hw_provider() - devm_clk_hw_register_clkdev() - -DMA - dmaenginem_async_device_register() - dmam_alloc_coherent() - dmam_alloc_attrs() - dmam_free_coherent() - dmam_pool_create() - dmam_pool_destroy() - -DRM - devm_drm_dev_init() - -GPIO - devm_gpiod_get() - devm_gpiod_get_index() - devm_gpiod_get_index_optional() - devm_gpiod_get_optional() - devm_gpiod_put() - devm_gpiod_unhinge() - devm_gpiochip_add_data() - devm_gpio_request() - devm_gpio_request_one() - devm_gpio_free() - -I2C - devm_i2c_new_dummy_device() - -IIO - devm_iio_device_alloc() - devm_iio_device_free() - devm_iio_device_register() - devm_iio_device_unregister() - devm_iio_kfifo_allocate() - devm_iio_kfifo_free() - devm_iio_triggered_buffer_setup() - devm_iio_triggered_buffer_cleanup() - devm_iio_trigger_alloc() - devm_iio_trigger_free() - devm_iio_trigger_register() - devm_iio_trigger_unregister() - devm_iio_channel_get() - devm_iio_channel_release() - devm_iio_channel_get_all() - devm_iio_channel_release_all() - -INPUT - devm_input_allocate_device() - -IO region - devm_release_mem_region() - devm_release_region() - devm_release_resource() - devm_request_mem_region() - devm_request_region() - devm_request_resource() - -IOMAP - devm_ioport_map() - devm_ioport_unmap() - devm_ioremap() - devm_ioremap_nocache() - devm_ioremap_wc() - devm_ioremap_resource() : checks resource, requests memory region, ioremaps - devm_iounmap() - pcim_iomap() - pcim_iomap_regions() : do request_region() and iomap() on multiple BARs - pcim_iomap_table() : array of mapped addresses indexed by BAR - pcim_iounmap() - -IRQ - devm_free_irq() - devm_request_any_context_irq() - devm_request_irq() - devm_request_threaded_irq() - devm_irq_alloc_descs() - devm_irq_alloc_desc() - devm_irq_alloc_desc_at() - devm_irq_alloc_desc_from() - devm_irq_alloc_descs_from() - devm_irq_alloc_generic_chip() - devm_irq_setup_generic_chip() - devm_irq_sim_init() - -LED - devm_led_classdev_register() - devm_led_classdev_unregister() - -MDIO - devm_mdiobus_alloc() - devm_mdiobus_alloc_size() - devm_mdiobus_free() - -MEM - devm_free_pages() - devm_get_free_pages() - devm_kasprintf() - devm_kcalloc() - devm_kfree() - devm_kmalloc() - devm_kmalloc_array() - devm_kmemdup() - devm_kstrdup() - devm_kvasprintf() - devm_kzalloc() - -MFD - devm_mfd_add_devices() - -MUX - devm_mux_chip_alloc() - devm_mux_chip_register() - devm_mux_control_get() - -PER-CPU MEM - devm_alloc_percpu() - devm_free_percpu() - -PCI - devm_pci_alloc_host_bridge() : managed PCI host bridge allocation - devm_pci_remap_cfgspace() : ioremap PCI configuration space - devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource - pcim_enable_device() : after success, all PCI ops become managed - pcim_pin_device() : keep PCI device enabled after release - -PHY - devm_usb_get_phy() - devm_usb_put_phy() - -PINCTRL - devm_pinctrl_get() - devm_pinctrl_put() - devm_pinctrl_register() - devm_pinctrl_unregister() - -POWER - devm_reboot_mode_register() - devm_reboot_mode_unregister() - -PWM - devm_pwm_get() - devm_pwm_put() - -REGULATOR - devm_regulator_bulk_get() - devm_regulator_get() - devm_regulator_put() - devm_regulator_register() - -RESET - devm_reset_control_get() - devm_reset_controller_register() - -SERDEV - devm_serdev_device_open() - -SLAVE DMA ENGINE - devm_acpi_dma_controller_register() - -SPI - devm_spi_register_master() - -WATCHDOG - devm_watchdog_register_device() diff --git a/Documentation/driver-model/driver.rst b/Documentation/driver-model/driver.rst deleted file mode 100644 index 11d281506a04..000000000000 --- a/Documentation/driver-model/driver.rst +++ /dev/null @@ -1,223 +0,0 @@ -============== -Device Drivers -============== - -See the kerneldoc for the struct device_driver. - - -Allocation -~~~~~~~~~~ - -Device drivers are statically allocated structures. Though there may -be multiple devices in a system that a driver supports, struct -device_driver represents the driver as a whole (not a particular -device instance). - -Initialization -~~~~~~~~~~~~~~ - -The driver must initialize at least the name and bus fields. It should -also initialize the devclass field (when it arrives), so it may obtain -the proper linkage internally. It should also initialize as many of -the callbacks as possible, though each is optional. - -Declaration -~~~~~~~~~~~ - -As stated above, struct device_driver objects are statically -allocated. Below is an example declaration of the eepro100 -driver. This declaration is hypothetical only; it relies on the driver -being converted completely to the new model:: - - static struct device_driver eepro100_driver = { - .name = "eepro100", - .bus = &pci_bus_type, - - .probe = eepro100_probe, - .remove = eepro100_remove, - .suspend = eepro100_suspend, - .resume = eepro100_resume, - }; - -Most drivers will not be able to be converted completely to the new -model because the bus they belong to has a bus-specific structure with -bus-specific fields that cannot be generalized. - -The most common example of this are device ID structures. A driver -typically defines an array of device IDs that it supports. The format -of these structures and the semantics for comparing device IDs are -completely bus-specific. Defining them as bus-specific entities would -sacrifice type-safety, so we keep bus-specific structures around. - -Bus-specific drivers should include a generic struct device_driver in -the definition of the bus-specific driver. Like this:: - - struct pci_driver { - const struct pci_device_id *id_table; - struct device_driver driver; - }; - -A definition that included bus-specific fields would look like -(using the eepro100 driver again):: - - static struct pci_driver eepro100_driver = { - .id_table = eepro100_pci_tbl, - .driver = { - .name = "eepro100", - .bus = &pci_bus_type, - .probe = eepro100_probe, - .remove = eepro100_remove, - .suspend = eepro100_suspend, - .resume = eepro100_resume, - }, - }; - -Some may find the syntax of embedded struct initialization awkward or -even a bit ugly. So far, it's the best way we've found to do what we want... - -Registration -~~~~~~~~~~~~ - -:: - - int driver_register(struct device_driver *drv); - -The driver registers the structure on startup. For drivers that have -no bus-specific fields (i.e. don't have a bus-specific driver -structure), they would use driver_register and pass a pointer to their -struct device_driver object. - -Most drivers, however, will have a bus-specific structure and will -need to register with the bus using something like pci_driver_register. - -It is important that drivers register their driver structure as early as -possible. Registration with the core initializes several fields in the -struct device_driver object, including the reference count and the -lock. These fields are assumed to be valid at all times and may be -used by the device model core or the bus driver. - - -Transition Bus Drivers -~~~~~~~~~~~~~~~~~~~~~~ - -By defining wrapper functions, the transition to the new model can be -made easier. Drivers can ignore the generic structure altogether and -let the bus wrapper fill in the fields. For the callbacks, the bus can -define generic callbacks that forward the call to the bus-specific -callbacks of the drivers. - -This solution is intended to be only temporary. In order to get class -information in the driver, the drivers must be modified anyway. Since -converting drivers to the new model should reduce some infrastructural -complexity and code size, it is recommended that they are converted as -class information is added. - -Access -~~~~~~ - -Once the object has been registered, it may access the common fields of -the object, like the lock and the list of devices:: - - int driver_for_each_dev(struct device_driver *drv, void *data, - int (*callback)(struct device *dev, void *data)); - -The devices field is a list of all the devices that have been bound to -the driver. The LDM core provides a helper function to operate on all -the devices a driver controls. This helper locks the driver on each -node access, and does proper reference counting on each device as it -accesses it. - - -sysfs -~~~~~ - -When a driver is registered, a sysfs directory is created in its -bus's directory. In this directory, the driver can export an interface -to userspace to control operation of the driver on a global basis; -e.g. toggling debugging output in the driver. - -A future feature of this directory will be a 'devices' directory. This -directory will contain symlinks to the directories of devices it -supports. - - - -Callbacks -~~~~~~~~~ - -:: - - int (*probe) (struct device *dev); - -The probe() entry is called in task context, with the bus's rwsem locked -and the driver partially bound to the device. Drivers commonly use -container_of() to convert "dev" to a bus-specific type, both in probe() -and other routines. That type often provides device resource data, such -as pci_dev.resource[] or platform_device.resources, which is used in -addition to dev->platform_data to initialize the driver. - -This callback holds the driver-specific logic to bind the driver to a -given device. That includes verifying that the device is present, that -it's a version the driver can handle, that driver data structures can -be allocated and initialized, and that any hardware can be initialized. -Drivers often store a pointer to their state with dev_set_drvdata(). -When the driver has successfully bound itself to that device, then probe() -returns zero and the driver model code will finish its part of binding -the driver to that device. - -A driver's probe() may return a negative errno value to indicate that -the driver did not bind to this device, in which case it should have -released all resources it allocated:: - - int (*remove) (struct device *dev); - -remove is called to unbind a driver from a device. This may be -called if a device is physically removed from the system, if the -driver module is being unloaded, during a reboot sequence, or -in other cases. - -It is up to the driver to determine if the device is present or -not. It should free any resources allocated specifically for the -device; i.e. anything in the device's driver_data field. - -If the device is still present, it should quiesce the device and place -it into a supported low-power state:: - - int (*suspend) (struct device *dev, pm_message_t state); - -suspend is called to put the device in a low power state:: - - int (*resume) (struct device *dev); - -Resume is used to bring a device back from a low power state. - - -Attributes -~~~~~~~~~~ - -:: - - struct driver_attribute { - struct attribute attr; - ssize_t (*show)(struct device_driver *driver, char *buf); - ssize_t (*store)(struct device_driver *, const char *buf, size_t count); - }; - -Device drivers can export attributes via their sysfs directories. -Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO -macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO -macros. - -Example:: - - DRIVER_ATTR_RW(debug); - -This is equivalent to declaring:: - - struct driver_attribute driver_attr_debug; - -This can then be used to add and remove the attribute from the -driver's directory using:: - - int driver_create_file(struct device_driver *, const struct driver_attribute *); - void driver_remove_file(struct device_driver *, const struct driver_attribute *); diff --git a/Documentation/driver-model/index.rst b/Documentation/driver-model/index.rst deleted file mode 100644 index 9f85d579ce56..000000000000 --- a/Documentation/driver-model/index.rst +++ /dev/null @@ -1,26 +0,0 @@ -:orphan: - -============ -Driver Model -============ - -.. toctree:: - :maxdepth: 1 - - binding - bus - class - design-patterns - device - devres - driver - overview - platform - porting - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/driver-model/overview.rst b/Documentation/driver-model/overview.rst deleted file mode 100644 index d4d1e9b40e0c..000000000000 --- a/Documentation/driver-model/overview.rst +++ /dev/null @@ -1,124 +0,0 @@ -============================= -The Linux Kernel Device Model -============================= - -Patrick Mochel - -Drafted 26 August 2002 -Updated 31 January 2006 - - -Overview -~~~~~~~~ - -The Linux Kernel Driver Model is a unification of all the disparate driver -models that were previously used in the kernel. It is intended to augment the -bus-specific drivers for bridges and devices by consolidating a set of data -and operations into globally accessible data structures. - -Traditional driver models implemented some sort of tree-like structure -(sometimes just a list) for the devices they control. There wasn't any -uniformity across the different bus types. - -The current driver model provides a common, uniform data model for describing -a bus and the devices that can appear under the bus. The unified bus -model includes a set of common attributes which all busses carry, and a set -of common callbacks, such as device discovery during bus probing, bus -shutdown, bus power management, etc. - -The common device and bridge interface reflects the goals of the modern -computer: namely the ability to do seamless device "plug and play", power -management, and hot plug. In particular, the model dictated by Intel and -Microsoft (namely ACPI) ensures that almost every device on almost any bus -on an x86-compatible system can work within this paradigm. Of course, -not every bus is able to support all such operations, although most -buses support most of those operations. - - -Downstream Access -~~~~~~~~~~~~~~~~~ - -Common data fields have been moved out of individual bus layers into a common -data structure. These fields must still be accessed by the bus layers, -and sometimes by the device-specific drivers. - -Other bus layers are encouraged to do what has been done for the PCI layer. -struct pci_dev now looks like this:: - - struct pci_dev { - ... - - struct device dev; /* Generic device interface */ - ... - }; - -Note first that the struct device dev within the struct pci_dev is -statically allocated. This means only one allocation on device discovery. - -Note also that that struct device dev is not necessarily defined at the -front of the pci_dev structure. This is to make people think about what -they're doing when switching between the bus driver and the global driver, -and to discourage meaningless and incorrect casts between the two. - -The PCI bus layer freely accesses the fields of struct device. It knows about -the structure of struct pci_dev, and it should know the structure of struct -device. Individual PCI device drivers that have been converted to the current -driver model generally do not and should not touch the fields of struct device, -unless there is a compelling reason to do so. - -The above abstraction prevents unnecessary pain during transitional phases. -If it were not done this way, then when a field was renamed or removed, every -downstream driver would break. On the other hand, if only the bus layer -(and not the device layer) accesses the struct device, it is only the bus -layer that needs to change. - - -User Interface -~~~~~~~~~~~~~~ - -By virtue of having a complete hierarchical view of all the devices in the -system, exporting a complete hierarchical view to userspace becomes relatively -easy. This has been accomplished by implementing a special purpose virtual -file system named sysfs. - -Almost all mainstream Linux distros mount this filesystem automatically; you -can see some variation of the following in the output of the "mount" command:: - - $ mount - ... - none on /sys type sysfs (rw,noexec,nosuid,nodev) - ... - $ - -The auto-mounting of sysfs is typically accomplished by an entry similar to -the following in the /etc/fstab file:: - - none /sys sysfs defaults 0 0 - -or something similar in the /lib/init/fstab file on Debian-based systems:: - - none /sys sysfs nodev,noexec,nosuid 0 0 - -If sysfs is not automatically mounted, you can always do it manually with:: - - # mount -t sysfs sysfs /sys - -Whenever a device is inserted into the tree, a directory is created for it. -This directory may be populated at each layer of discovery - the global layer, -the bus layer, or the device layer. - -The global layer currently creates two files - 'name' and 'power'. The -former only reports the name of the device. The latter reports the -current power state of the device. It will also be used to set the current -power state. - -The bus layer may also create files for the devices it finds while probing the -bus. For example, the PCI layer currently creates 'irq' and 'resource' files -for each PCI device. - -A device-specific driver may also export files in its directory to expose -device-specific data or tunable interfaces. - -More information about the sysfs directory layout can be found in -the other documents in this directory and in the file -Documentation/filesystems/sysfs.txt. diff --git a/Documentation/driver-model/platform.rst b/Documentation/driver-model/platform.rst deleted file mode 100644 index 334dd4071ae4..000000000000 --- a/Documentation/driver-model/platform.rst +++ /dev/null @@ -1,246 +0,0 @@ -============================ -Platform Devices and Drivers -============================ - -See for the driver model interface to the -platform bus: platform_device, and platform_driver. This pseudo-bus -is used to connect devices on busses with minimal infrastructure, -like those used to integrate peripherals on many system-on-chip -processors, or some "legacy" PC interconnects; as opposed to large -formally specified ones like PCI or USB. - - -Platform devices -~~~~~~~~~~~~~~~~ -Platform devices are devices that typically appear as autonomous -entities in the system. This includes legacy port-based devices and -host bridges to peripheral buses, and most controllers integrated -into system-on-chip platforms. What they usually have in common -is direct addressing from a CPU bus. Rarely, a platform_device will -be connected through a segment of some other kind of bus; but its -registers will still be directly addressable. - -Platform devices are given a name, used in driver binding, and a -list of resources such as addresses and IRQs:: - - struct platform_device { - const char *name; - u32 id; - struct device dev; - u32 num_resources; - struct resource *resource; - }; - - -Platform drivers -~~~~~~~~~~~~~~~~ -Platform drivers follow the standard driver model convention, where -discovery/enumeration is handled outside the drivers, and drivers -provide probe() and remove() methods. They support power management -and shutdown notifications using the standard conventions:: - - struct platform_driver { - int (*probe)(struct platform_device *); - int (*remove)(struct platform_device *); - void (*shutdown)(struct platform_device *); - int (*suspend)(struct platform_device *, pm_message_t state); - int (*suspend_late)(struct platform_device *, pm_message_t state); - int (*resume_early)(struct platform_device *); - int (*resume)(struct platform_device *); - struct device_driver driver; - }; - -Note that probe() should in general verify that the specified device hardware -actually exists; sometimes platform setup code can't be sure. The probing -can use device resources, including clocks, and device platform_data. - -Platform drivers register themselves the normal way:: - - int platform_driver_register(struct platform_driver *drv); - -Or, in common situations where the device is known not to be hot-pluggable, -the probe() routine can live in an init section to reduce the driver's -runtime memory footprint:: - - int platform_driver_probe(struct platform_driver *drv, - int (*probe)(struct platform_device *)) - -Kernel modules can be composed of several platform drivers. The platform core -provides helpers to register and unregister an array of drivers:: - - int __platform_register_drivers(struct platform_driver * const *drivers, - unsigned int count, struct module *owner); - void platform_unregister_drivers(struct platform_driver * const *drivers, - unsigned int count); - -If one of the drivers fails to register, all drivers registered up to that -point will be unregistered in reverse order. Note that there is a convenience -macro that passes THIS_MODULE as owner parameter:: - - #define platform_register_drivers(drivers, count) - - -Device Enumeration -~~~~~~~~~~~~~~~~~~ -As a rule, platform specific (and often board-specific) setup code will -register platform devices:: - - int platform_device_register(struct platform_device *pdev); - - int platform_add_devices(struct platform_device **pdevs, int ndev); - -The general rule is to register only those devices that actually exist, -but in some cases extra devices might be registered. For example, a kernel -might be configured to work with an external network adapter that might not -be populated on all boards, or likewise to work with an integrated controller -that some boards might not hook up to any peripherals. - -In some cases, boot firmware will export tables describing the devices -that are populated on a given board. Without such tables, often the -only way for system setup code to set up the correct devices is to build -a kernel for a specific target board. Such board-specific kernels are -common with embedded and custom systems development. - -In many cases, the memory and IRQ resources associated with the platform -device are not enough to let the device's driver work. Board setup code -will often provide additional information using the device's platform_data -field to hold additional information. - -Embedded systems frequently need one or more clocks for platform devices, -which are normally kept off until they're actively needed (to save power). -System setup also associates those clocks with the device, so that that -calls to clk_get(&pdev->dev, clock_name) return them as needed. - - -Legacy Drivers: Device Probing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some drivers are not fully converted to the driver model, because they take -on a non-driver role: the driver registers its platform device, rather than -leaving that for system infrastructure. Such drivers can't be hotplugged -or coldplugged, since those mechanisms require device creation to be in a -different system component than the driver. - -The only "good" reason for this is to handle older system designs which, like -original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware -configuration. Newer systems have largely abandoned that model, in favor of -bus-level support for dynamic configuration (PCI, USB), or device tables -provided by the boot firmware (e.g. PNPACPI on x86). There are too many -conflicting options about what might be where, and even educated guesses by -an operating system will be wrong often enough to make trouble. - -This style of driver is discouraged. If you're updating such a driver, -please try to move the device enumeration to a more appropriate location, -outside the driver. This will usually be cleanup, since such drivers -tend to already have "normal" modes, such as ones using device nodes that -were created by PNP or by platform device setup. - -None the less, there are some APIs to support such legacy drivers. Avoid -using these calls except with such hotplug-deficient drivers:: - - struct platform_device *platform_device_alloc( - const char *name, int id); - -You can use platform_device_alloc() to dynamically allocate a device, which -you will then initialize with resources and platform_device_register(). -A better solution is usually:: - - struct platform_device *platform_device_register_simple( - const char *name, int id, - struct resource *res, unsigned int nres); - -You can use platform_device_register_simple() as a one-step call to allocate -and register a device. - - -Device Naming and Driver Binding -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The platform_device.dev.bus_id is the canonical name for the devices. -It's built from two components: - - * platform_device.name ... which is also used to for driver matching. - - * platform_device.id ... the device instance number, or else "-1" - to indicate there's only one. - -These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and -"serial/3" indicates bus_id "serial.3"; both would use the platform_driver -named "serial". While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id) -and use the platform_driver called "my_rtc". - -Driver binding is performed automatically by the driver core, invoking -driver probe() after finding a match between device and driver. If the -probe() succeeds, the driver and device are bound as usual. There are -three different ways to find such a match: - - - Whenever a device is registered, the drivers for that bus are - checked for matches. Platform devices should be registered very - early during system boot. - - - When a driver is registered using platform_driver_register(), all - unbound devices on that bus are checked for matches. Drivers - usually register later during booting, or by module loading. - - - Registering a driver using platform_driver_probe() works just like - using platform_driver_register(), except that the driver won't - be probed later if another device registers. (Which is OK, since - this interface is only for use with non-hotpluggable devices.) - - -Early Platform Devices and Drivers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The early platform interfaces provide platform data to platform device -drivers early on during the system boot. The code is built on top of the -early_param() command line parsing and can be executed very early on. - -Example: "earlyprintk" class early serial console in 6 steps - -1. Registering early platform device data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The architecture code registers platform device data using the function -early_platform_add_devices(). In the case of early serial console this -should be hardware configuration for the serial port. Devices registered -at this point will later on be matched against early platform drivers. - -2. Parsing kernel command line -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The architecture code calls parse_early_param() to parse the kernel -command line. This will execute all matching early_param() callbacks. -User specified early platform devices will be registered at this point. -For the early serial console case the user can specify port on the -kernel command line as "earlyprintk=serial.0" where "earlyprintk" is -the class string, "serial" is the name of the platform driver and -0 is the platform device id. If the id is -1 then the dot and the -id can be omitted. - -3. Installing early platform drivers belonging to a certain class -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The architecture code may optionally force registration of all early -platform drivers belonging to a certain class using the function -early_platform_driver_register_all(). User specified devices from -step 2 have priority over these. This step is omitted by the serial -driver example since the early serial driver code should be disabled -unless the user has specified port on the kernel command line. - -4. Early platform driver registration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Compiled-in platform drivers making use of early_platform_init() are -automatically registered during step 2 or 3. The serial driver example -should use early_platform_init("earlyprintk", &platform_driver). - -5. Probing of early platform drivers belonging to a certain class -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The architecture code calls early_platform_driver_probe() to match -registered early platform devices associated with a certain class with -registered early platform drivers. Matched devices will get probed(). -This step can be executed at any point during the early boot. As soon -as possible may be good for the serial port case. - -6. Inside the early platform driver probe() -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The driver code needs to take special care during early boot, especially -when it comes to memory allocation and interrupt registration. The code -in the probe() function can use is_early_platform_device() to check if -it is called at early platform device or at the regular platform device -time. The early serial driver performs register_console() at this point. - -For further information, see . diff --git a/Documentation/driver-model/porting.rst b/Documentation/driver-model/porting.rst deleted file mode 100644 index ae4bf843c1d6..000000000000 --- a/Documentation/driver-model/porting.rst +++ /dev/null @@ -1,448 +0,0 @@ -======================================= -Porting Drivers to the New Driver Model -======================================= - -Patrick Mochel - -7 January 2003 - - -Overview - -Please refer to `Documentation/driver-model/*.rst` for definitions of -various driver types and concepts. - -Most of the work of porting devices drivers to the new model happens -at the bus driver layer. This was intentional, to minimize the -negative effect on kernel drivers, and to allow a gradual transition -of bus drivers. - -In a nutshell, the driver model consists of a set of objects that can -be embedded in larger, bus-specific objects. Fields in these generic -objects can replace fields in the bus-specific objects. - -The generic objects must be registered with the driver model core. By -doing so, they will exported via the sysfs filesystem. sysfs can be -mounted by doing:: - - # mount -t sysfs sysfs /sys - - - -The Process - -Step 0: Read include/linux/device.h for object and function definitions. - -Step 1: Registering the bus driver. - - -- Define a struct bus_type for the bus driver:: - - struct bus_type pci_bus_type = { - .name = "pci", - }; - - -- Register the bus type. - - This should be done in the initialization function for the bus type, - which is usually the module_init(), or equivalent, function:: - - static int __init pci_driver_init(void) - { - return bus_register(&pci_bus_type); - } - - subsys_initcall(pci_driver_init); - - - The bus type may be unregistered (if the bus driver may be compiled - as a module) by doing:: - - bus_unregister(&pci_bus_type); - - -- Export the bus type for others to use. - - Other code may wish to reference the bus type, so declare it in a - shared header file and export the symbol. - -From include/linux/pci.h:: - - extern struct bus_type pci_bus_type; - - -From file the above code appears in:: - - EXPORT_SYMBOL(pci_bus_type); - - - -- This will cause the bus to show up in /sys/bus/pci/ with two - subdirectories: 'devices' and 'drivers':: - - # tree -d /sys/bus/pci/ - /sys/bus/pci/ - |-- devices - `-- drivers - - - -Step 2: Registering Devices. - -struct device represents a single device. It mainly contains metadata -describing the relationship the device has to other entities. - - -- Embed a struct device in the bus-specific device type:: - - - struct pci_dev { - ... - struct device dev; /* Generic device interface */ - ... - }; - - It is recommended that the generic device not be the first item in - the struct to discourage programmers from doing mindless casts - between the object types. Instead macros, or inline functions, - should be created to convert from the generic object type:: - - - #define to_pci_dev(n) container_of(n, struct pci_dev, dev) - - or - - static inline struct pci_dev * to_pci_dev(struct kobject * kobj) - { - return container_of(n, struct pci_dev, dev); - } - - This allows the compiler to verify type-safety of the operations - that are performed (which is Good). - - -- Initialize the device on registration. - - When devices are discovered or registered with the bus type, the - bus driver should initialize the generic device. The most important - things to initialize are the bus_id, parent, and bus fields. - - The bus_id is an ASCII string that contains the device's address on - the bus. The format of this string is bus-specific. This is - necessary for representing devices in sysfs. - - parent is the physical parent of the device. It is important that - the bus driver sets this field correctly. - - The driver model maintains an ordered list of devices that it uses - for power management. This list must be in order to guarantee that - devices are shutdown before their physical parents, and vice versa. - The order of this list is determined by the parent of registered - devices. - - Also, the location of the device's sysfs directory depends on a - device's parent. sysfs exports a directory structure that mirrors - the device hierarchy. Accurately setting the parent guarantees that - sysfs will accurately represent the hierarchy. - - The device's bus field is a pointer to the bus type the device - belongs to. This should be set to the bus_type that was declared - and initialized before. - - Optionally, the bus driver may set the device's name and release - fields. - - The name field is an ASCII string describing the device, like - - "ATI Technologies Inc Radeon QD" - - The release field is a callback that the driver model core calls - when the device has been removed, and all references to it have - been released. More on this in a moment. - - -- Register the device. - - Once the generic device has been initialized, it can be registered - with the driver model core by doing:: - - device_register(&dev->dev); - - It can later be unregistered by doing:: - - device_unregister(&dev->dev); - - This should happen on buses that support hotpluggable devices. - If a bus driver unregisters a device, it should not immediately free - it. It should instead wait for the driver model core to call the - device's release method, then free the bus-specific object. - (There may be other code that is currently referencing the device - structure, and it would be rude to free the device while that is - happening). - - - When the device is registered, a directory in sysfs is created. - The PCI tree in sysfs looks like:: - - /sys/devices/pci0/ - |-- 00:00.0 - |-- 00:01.0 - | `-- 01:00.0 - |-- 00:02.0 - | `-- 02:1f.0 - | `-- 03:00.0 - |-- 00:1e.0 - | `-- 04:04.0 - |-- 00:1f.0 - |-- 00:1f.1 - | |-- ide0 - | | |-- 0.0 - | | `-- 0.1 - | `-- ide1 - | `-- 1.0 - |-- 00:1f.2 - |-- 00:1f.3 - `-- 00:1f.5 - - Also, symlinks are created in the bus's 'devices' directory - that point to the device's directory in the physical hierarchy:: - - /sys/bus/pci/devices/ - |-- 00:00.0 -> ../../../devices/pci0/00:00.0 - |-- 00:01.0 -> ../../../devices/pci0/00:01.0 - |-- 00:02.0 -> ../../../devices/pci0/00:02.0 - |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0 - |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0 - |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1 - |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2 - |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3 - |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5 - |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0 - |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0 - |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0 - `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0 - - - -Step 3: Registering Drivers. - -struct device_driver is a simple driver structure that contains a set -of operations that the driver model core may call. - - -- Embed a struct device_driver in the bus-specific driver. - - Just like with devices, do something like:: - - struct pci_driver { - ... - struct device_driver driver; - }; - - -- Initialize the generic driver structure. - - When the driver registers with the bus (e.g. doing pci_register_driver()), - initialize the necessary fields of the driver: the name and bus - fields. - - -- Register the driver. - - After the generic driver has been initialized, call:: - - driver_register(&drv->driver); - - to register the driver with the core. - - When the driver is unregistered from the bus, unregister it from the - core by doing:: - - driver_unregister(&drv->driver); - - Note that this will block until all references to the driver have - gone away. Normally, there will not be any. - - -- Sysfs representation. - - Drivers are exported via sysfs in their bus's 'driver's directory. - For example:: - - /sys/bus/pci/drivers/ - |-- 3c59x - |-- Ensoniq AudioPCI - |-- agpgart-amdk7 - |-- e100 - `-- serial - - -Step 4: Define Generic Methods for Drivers. - -struct device_driver defines a set of operations that the driver model -core calls. Most of these operations are probably similar to -operations the bus already defines for drivers, but taking different -parameters. - -It would be difficult and tedious to force every driver on a bus to -simultaneously convert their drivers to generic format. Instead, the -bus driver should define single instances of the generic methods that -forward call to the bus-specific drivers. For instance:: - - - static int pci_device_remove(struct device * dev) - { - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - - if (drv) { - if (drv->remove) - drv->remove(pci_dev); - pci_dev->driver = NULL; - } - return 0; - } - - -The generic driver should be initialized with these methods before it -is registered:: - - /* initialize common driver fields */ - drv->driver.name = drv->name; - drv->driver.bus = &pci_bus_type; - drv->driver.probe = pci_device_probe; - drv->driver.resume = pci_device_resume; - drv->driver.suspend = pci_device_suspend; - drv->driver.remove = pci_device_remove; - - /* register with core */ - driver_register(&drv->driver); - - -Ideally, the bus should only initialize the fields if they are not -already set. This allows the drivers to implement their own generic -methods. - - -Step 5: Support generic driver binding. - -The model assumes that a device or driver can be dynamically -registered with the bus at any time. When registration happens, -devices must be bound to a driver, or drivers must be bound to all -devices that it supports. - -A driver typically contains a list of device IDs that it supports. The -bus driver compares these IDs to the IDs of devices registered with it. -The format of the device IDs, and the semantics for comparing them are -bus-specific, so the generic model does attempt to generalize them. - -Instead, a bus may supply a method in struct bus_type that does the -comparison:: - - int (*match)(struct device * dev, struct device_driver * drv); - -match should return positive value if the driver supports the device, -and zero otherwise. It may also return error code (for example --EPROBE_DEFER) if determining that given driver supports the device is -not possible. - -When a device is registered, the bus's list of drivers is iterated -over. bus->match() is called for each one until a match is found. - -When a driver is registered, the bus's list of devices is iterated -over. bus->match() is called for each device that is not already -claimed by a driver. - -When a device is successfully bound to a driver, device->driver is -set, the device is added to a per-driver list of devices, and a -symlink is created in the driver's sysfs directory that points to the -device's physical directory:: - - /sys/bus/pci/drivers/ - |-- 3c59x - | `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0 - |-- Ensoniq AudioPCI - |-- agpgart-amdk7 - | `-- 00:00.0 -> ../../../../devices/pci0/00:00.0 - |-- e100 - | `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0 - `-- serial - - -This driver binding should replace the existing driver binding -mechanism the bus currently uses. - - -Step 6: Supply a hotplug callback. - -Whenever a device is registered with the driver model core, the -userspace program /sbin/hotplug is called to notify userspace. -Users can define actions to perform when a device is inserted or -removed. - -The driver model core passes several arguments to userspace via -environment variables, including - -- ACTION: set to 'add' or 'remove' -- DEVPATH: set to the device's physical path in sysfs. - -A bus driver may also supply additional parameters for userspace to -consume. To do this, a bus must implement the 'hotplug' method in -struct bus_type:: - - int (*hotplug) (struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); - -This is called immediately before /sbin/hotplug is executed. - - -Step 7: Cleaning up the bus driver. - -The generic bus, device, and driver structures provide several fields -that can replace those defined privately to the bus driver. - -- Device list. - -struct bus_type contains a list of all devices registered with the bus -type. This includes all devices on all instances of that bus type. -An internal list that the bus uses may be removed, in favor of using -this one. - -The core provides an iterator to access these devices:: - - int bus_for_each_dev(struct bus_type * bus, struct device * start, - void * data, int (*fn)(struct device *, void *)); - - -- Driver list. - -struct bus_type also contains a list of all drivers registered with -it. An internal list of drivers that the bus driver maintains may -be removed in favor of using the generic one. - -The drivers may be iterated over, like devices:: - - int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, - void * data, int (*fn)(struct device_driver *, void *)); - - -Please see drivers/base/bus.c for more information. - - -- rwsem - -struct bus_type contains an rwsem that protects all core accesses to -the device and driver lists. This can be used by the bus driver -internally, and should be used when accessing the device or driver -lists the bus maintains. - - -- Device and driver fields. - -Some of the fields in struct device and struct device_driver duplicate -fields in the bus-specific representations of these objects. Feel free -to remove the bus-specific ones and favor the generic ones. Note -though, that this will likely mean fixing up all the drivers that -reference the bus-specific fields (though those should all be 1-line -changes). diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt index f388545a85a7..c07565ba57da 100644 --- a/Documentation/eisa.txt +++ b/Documentation/eisa.txt @@ -103,7 +103,7 @@ id_table an array of NULL terminated EISA id strings, (driver_data). driver a generic driver, such as described in - Documentation/driver-model/driver.rst. Only .name, + Documentation/driver-api/driver-model/driver.rst. Only .name, .probe and .remove members are mandatory. =============== ==================================================== @@ -152,7 +152,7 @@ state set of flags indicating the state of the device. Current flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED. res set of four 256 bytes I/O regions allocated to this device dma_mask DMA mask set from the parent device. -dev generic device (see Documentation/driver-model/device.rst) +dev generic device (see Documentation/driver-api/driver-model/device.rst) ======== ============================================================ You can get the 'struct eisa_device' from 'struct device' using the diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 5b5311f9358d..ddf15b1b0d5a 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -319,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of a stat(2) operation. More information can driver-model specific features can be found in -Documentation/driver-model/. +Documentation/driver-api/driver-model/. TODO: Finish this section. diff --git a/Documentation/hwmon/submitting-patches.rst b/Documentation/hwmon/submitting-patches.rst index d5b05d3e54ba..452fc28d8e0b 100644 --- a/Documentation/hwmon/submitting-patches.rst +++ b/Documentation/hwmon/submitting-patches.rst @@ -89,7 +89,7 @@ increase the chances of your change being accepted. console. Excessive logging can seriously affect system performance. * Use devres functions whenever possible to allocate resources. For rationale - and supported functions, please see Documentation/driver-model/devres.rst. + and supported functions, please see Documentation/driver-api/driver-model/devres.rst. If a function is not supported by devres, consider using devm_add_action(). * If the driver has a detect function, make sure it is silent. Debug messages diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index 452271dda141..ee1f37da5b23 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -288,7 +288,7 @@ dev/ 包含两个子目录: char/ 和 block/。在这两个子目录中,有 中相应的设备。/sys/dev 提供一个通过一个 stat(2) 操作结果,查找 设备 sysfs 接口快捷的方法。 -更多有关 driver-model 的特性信息可以在 Documentation/driver-model/ +更多有关 driver-model 的特性信息可以在 Documentation/driver-api/driver-model/ 中找到。 -- cgit From df1b7ce784c220373d202ea9f8bc0c424f2c9f7c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 17:16:23 -0300 Subject: docs: add some documentation dirs to the driver-api book Those are subsystem docs, with a mix of kABI and user-faced docs. While they're not split, keep the dirs where they are, adding just a pointer to the main index. Signed-off-by: Mauro Carvalho Chehab --- Documentation/accounting/index.rst | 2 +- Documentation/block/index.rst | 2 +- Documentation/hid/index.rst | 2 +- Documentation/iio/index.rst | 2 +- Documentation/index.rst | 4 ++++ 5 files changed, 8 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/accounting/index.rst b/Documentation/accounting/index.rst index e1f6284b5ff3..9369d8bf32be 100644 --- a/Documentation/accounting/index.rst +++ b/Documentation/accounting/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ========== Accounting diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst index 8cd226a0e86e..3fa7a52fafa4 100644 --- a/Documentation/block/index.rst +++ b/Documentation/block/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ===== Block diff --git a/Documentation/hid/index.rst b/Documentation/hid/index.rst index af4324902622..737d66dc16a1 100644 --- a/Documentation/hid/index.rst +++ b/Documentation/hid/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ============================= Human Interface Devices (HID) diff --git a/Documentation/iio/index.rst b/Documentation/iio/index.rst index 0593dca89a94..58b7a4ebac51 100644 --- a/Documentation/iio/index.rst +++ b/Documentation/iio/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ============== Industrial I/O diff --git a/Documentation/index.rst b/Documentation/index.rst index a322c8721d13..dcdaaff71633 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -92,6 +92,10 @@ needed). driver-api/index core-api/index + accounting/index + block/index + hid/index + iio/index leds/index media/index networking/index -- cgit From 83bbf6e103544d65f17f4b2ccea1c6a51c0b0769 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 12:59:40 -0300 Subject: docs: aoe: add it to the driver-api book Those files belong to the admin guide, so add them. Signed-off-by: Mauro Carvalho Chehab Acked-by: Justin Sanders --- Documentation/admin-guide/aoe/aoe.rst | 150 ++++++++++++++++++++++++++ Documentation/admin-guide/aoe/autoload.sh | 17 +++ Documentation/admin-guide/aoe/examples.rst | 23 ++++ Documentation/admin-guide/aoe/index.rst | 17 +++ Documentation/admin-guide/aoe/status.sh | 30 ++++++ Documentation/admin-guide/aoe/todo.rst | 17 +++ Documentation/admin-guide/aoe/udev-install.sh | 33 ++++++ Documentation/admin-guide/aoe/udev.txt | 26 +++++ Documentation/admin-guide/index.rst | 1 + Documentation/aoe/aoe.rst | 150 -------------------------- Documentation/aoe/autoload.sh | 17 --- Documentation/aoe/examples.rst | 23 ---- Documentation/aoe/index.rst | 19 ---- Documentation/aoe/status.sh | 30 ------ Documentation/aoe/todo.rst | 17 --- Documentation/aoe/udev-install.sh | 33 ------ Documentation/aoe/udev.txt | 26 ----- 17 files changed, 314 insertions(+), 315 deletions(-) create mode 100644 Documentation/admin-guide/aoe/aoe.rst create mode 100644 Documentation/admin-guide/aoe/autoload.sh create mode 100644 Documentation/admin-guide/aoe/examples.rst create mode 100644 Documentation/admin-guide/aoe/index.rst create mode 100644 Documentation/admin-guide/aoe/status.sh create mode 100644 Documentation/admin-guide/aoe/todo.rst create mode 100644 Documentation/admin-guide/aoe/udev-install.sh create mode 100644 Documentation/admin-guide/aoe/udev.txt delete mode 100644 Documentation/aoe/aoe.rst delete mode 100644 Documentation/aoe/autoload.sh delete mode 100644 Documentation/aoe/examples.rst delete mode 100644 Documentation/aoe/index.rst delete mode 100644 Documentation/aoe/status.sh delete mode 100644 Documentation/aoe/todo.rst delete mode 100644 Documentation/aoe/udev-install.sh delete mode 100644 Documentation/aoe/udev.txt (limited to 'Documentation') diff --git a/Documentation/admin-guide/aoe/aoe.rst b/Documentation/admin-guide/aoe/aoe.rst new file mode 100644 index 000000000000..a05e751363a0 --- /dev/null +++ b/Documentation/admin-guide/aoe/aoe.rst @@ -0,0 +1,150 @@ +Introduction +============ + +ATA over Ethernet is a network protocol that provides simple access to +block storage on the LAN. + + http://support.coraid.com/documents/AoEr11.txt + +The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ... + + http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html + +It has many tips and hints! Please see, especially, recommended +tunings for virtual memory: + + http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19 + +The aoetools are userland programs that are designed to work with this +driver. The aoetools are on sourceforge. + + http://aoetools.sourceforge.net/ + +The scripts in this Documentation/admin-guide/aoe directory are intended to +document the use of the driver and are not necessary if you install +the aoetools. + + +Creating Device Nodes +===================== + + Users of udev should find the block device nodes created + automatically, but to create all the necessary device nodes, use the + udev configuration rules provided in udev.txt (in this directory). + + There is a udev-install.sh script that shows how to install these + rules on your system. + + There is also an autoload script that shows how to edit + /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when + necessary. Preloading the aoe module is preferable to autoloading, + however, because AoE discovery takes a few seconds. It can be + confusing when an AoE device is not present the first time the a + command is run but appears a second later. + +Using Device Nodes +================== + + "cat /dev/etherd/err" blocks, waiting for error diagnostic output, + like any retransmitted packets. + + "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to + limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from + untrusted networks should be ignored as a matter of security. See + also the aoe_iflist driver option described below. + + "echo > /dev/etherd/discover" tells the driver to find out what AoE + devices are available. + + In the future these character devices may disappear and be replaced + by sysfs counterparts. Using the commands in aoetools insulates + users from these implementation details. + + The block devices are named like this:: + + e{shelf}.{slot} + e{shelf}.{slot}p{part} + + ... so that "e0.2" is the third blade from the left (slot 2) in the + first shelf (shelf address zero). That's the whole disk. The first + partition on that disk would be "e0.2p1". + +Using sysfs +=========== + + Each aoe block device in /sys/block has the extra attributes of + state, mac, and netif. The state attribute is "up" when the device + is ready for I/O and "down" if detected but unusable. The + "down,closewait" state shows that the device is still open and + cannot come up again until it has been closed. + + The mac attribute is the ethernet address of the remote AoE device. + The netif attribute is the network interface on the localhost + through which we are communicating with the remote AoE device. + + There is a script in this directory that formats this information in + a convenient way. Users with aoetools should use the aoe-stat + command:: + + root@makki root# sh Documentation/admin-guide/aoe/status.sh + e10.0 eth3 up + e10.1 eth3 up + e10.2 eth3 up + e10.3 eth3 up + e10.4 eth3 up + e10.5 eth3 up + e10.6 eth3 up + e10.7 eth3 up + e10.8 eth3 up + e10.9 eth3 up + e4.0 eth1 up + e4.1 eth1 up + e4.2 eth1 up + e4.3 eth1 up + e4.4 eth1 up + e4.5 eth1 up + e4.6 eth1 up + e4.7 eth1 up + e4.8 eth1 up + e4.9 eth1 up + + Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver + option discussed below) instead of /dev/etherd/interfaces to limit + AoE traffic to the network interfaces in the given + whitespace-separated list. Unlike the old character device, the + sysfs entry can be read from as well as written to. + + It's helpful to trigger discovery after setting the list of allowed + interfaces. The aoetools package provides an aoe-discover script + for this purpose. You can also directly use the + /dev/etherd/discover special file described above. + +Driver Options +============== + + There is a boot option for the built-in aoe driver and a + corresponding module parameter, aoe_iflist. Without this option, + all network interfaces may be used for ATA over Ethernet. Here is a + usage example for the module parameter:: + + modprobe aoe_iflist="eth1 eth3" + + The aoe_deadsecs module parameter determines the maximum number of + seconds that the driver will wait for an AoE device to provide a + response to an AoE command. After aoe_deadsecs seconds have + elapsed, the AoE device will be marked as "down". A value of zero + is supported for testing purposes and makes the aoe driver keep + trying AoE commands forever. + + The aoe_maxout module parameter has a default of 128. This is the + maximum number of unresponded packets that will be sent to an AoE + target at one time. + + The aoe_dyndevs module parameter defaults to 1, meaning that the + driver will assign a block device minor number to a discovered AoE + target based on the order of its discovery. With dynamic minor + device numbers in use, a greater range of AoE shelf and slot + addresses can be supported. Users with udev will never have to + think about minor numbers. Using aoe_dyndevs=0 allows device nodes + to be pre-created using a static minor-number scheme with the + aoe-mkshelf script in the aoetools. diff --git a/Documentation/admin-guide/aoe/autoload.sh b/Documentation/admin-guide/aoe/autoload.sh new file mode 100644 index 000000000000..815dff4691c9 --- /dev/null +++ b/Documentation/admin-guide/aoe/autoload.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# set aoe to autoload by installing the +# aliases in /etc/modprobe.d/ + +f=/etc/modprobe.d/aoe.conf + +if test ! -r $f || test ! -w $f; then + echo "cannot configure $f for module autoloading" 1>&2 + exit 1 +fi + +grep major-152 $f >/dev/null +if [ $? = 1 ]; then + echo alias block-major-152 aoe >> $f + echo alias char-major-152 aoe >> $f +fi + diff --git a/Documentation/admin-guide/aoe/examples.rst b/Documentation/admin-guide/aoe/examples.rst new file mode 100644 index 000000000000..91f3198e52c1 --- /dev/null +++ b/Documentation/admin-guide/aoe/examples.rst @@ -0,0 +1,23 @@ +Example of udev rules +--------------------- + + .. include:: udev.txt + :literal: + +Example of udev install rules script +------------------------------------ + + .. literalinclude:: udev-install.sh + :language: shell + +Example script to get status +---------------------------- + + .. literalinclude:: status.sh + :language: shell + +Example of AoE autoload script +------------------------------ + + .. literalinclude:: autoload.sh + :language: shell diff --git a/Documentation/admin-guide/aoe/index.rst b/Documentation/admin-guide/aoe/index.rst new file mode 100644 index 000000000000..d71c5df15922 --- /dev/null +++ b/Documentation/admin-guide/aoe/index.rst @@ -0,0 +1,17 @@ +======================= +ATA over Ethernet (AoE) +======================= + +.. toctree:: + :maxdepth: 1 + + aoe + todo + examples + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/aoe/status.sh b/Documentation/admin-guide/aoe/status.sh new file mode 100644 index 000000000000..eeec7baae57a --- /dev/null +++ b/Documentation/admin-guide/aoe/status.sh @@ -0,0 +1,30 @@ +#! /bin/sh +# collate and present sysfs information about AoE storage +# +# A more complete version of this script is aoe-stat, in the +# aoetools. + +set -e +format="%8s\t%8s\t%8s\n" +me=`basename $0` +sysd=${sysfs_dir:-/sys} + +# printf "$format" device mac netif state + +# Suse 9.1 Pro doesn't put /sys in /etc/mtab +#test -z "`mount | grep sysfs`" && { +test ! -d "$sysd/block" && { + echo "$me Error: sysfs is not mounted" 1>&2 + exit 1 +} + +for d in `ls -d $sysd/block/etherd* 2>/dev/null | grep -v p` end; do + # maybe ls comes up empty, so we use "end" + test $d = end && continue + + dev=`echo "$d" | sed 's/.*!//'` + printf "$format" \ + "$dev" \ + "`cat \"$d/netif\"`" \ + "`cat \"$d/state\"`" +done | sort diff --git a/Documentation/admin-guide/aoe/todo.rst b/Documentation/admin-guide/aoe/todo.rst new file mode 100644 index 000000000000..dea8db5a33e1 --- /dev/null +++ b/Documentation/admin-guide/aoe/todo.rst @@ -0,0 +1,17 @@ +TODO +==== + +There is a potential for deadlock when allocating a struct sk_buff for +data that needs to be written out to aoe storage. If the data is +being written from a dirty page in order to free that page, and if +there are no other pages available, then deadlock may occur when a +free page is needed for the sk_buff allocation. This situation has +not been observed, but it would be nice to eliminate any potential for +deadlock under memory pressure. + +Because ATA over Ethernet is not fragmented by the kernel's IP code, +the destructor member of the struct sk_buff is available to the aoe +driver. By using a mempool for allocating all but the first few +sk_buffs, and by registering a destructor, we should be able to +efficiently allocate sk_buffs without introducing any potential for +deadlock. diff --git a/Documentation/admin-guide/aoe/udev-install.sh b/Documentation/admin-guide/aoe/udev-install.sh new file mode 100644 index 000000000000..15e86f58c036 --- /dev/null +++ b/Documentation/admin-guide/aoe/udev-install.sh @@ -0,0 +1,33 @@ +# install the aoe-specific udev rules from udev.txt into +# the system's udev configuration +# + +me="`basename $0`" + +# find udev.conf, often /etc/udev/udev.conf +# (or environment can specify where to find udev.conf) +# +if test -z "$conf"; then + if test -r /etc/udev/udev.conf; then + conf=/etc/udev/udev.conf + else + conf="`find /etc -type f -name udev.conf 2> /dev/null`" + if test -z "$conf" || test ! -r "$conf"; then + echo "$me Error: no udev.conf found" 1>&2 + exit 1 + fi + fi +fi + +# find the directory where udev rules are stored, often +# /etc/udev/rules.d +# +rules_d="`sed -n '/^udev_rules=/{ s!udev_rules=!!; s!\"!!g; p; }' $conf`" +if test -z "$rules_d" ; then + rules_d=/etc/udev/rules.d +fi +if test ! -d "$rules_d"; then + echo "$me Error: cannot find udev rules directory" 1>&2 + exit 1 +fi +sh -xc "cp `dirname $0`/udev.txt $rules_d/60-aoe.rules" diff --git a/Documentation/admin-guide/aoe/udev.txt b/Documentation/admin-guide/aoe/udev.txt new file mode 100644 index 000000000000..5fb756466bc7 --- /dev/null +++ b/Documentation/admin-guide/aoe/udev.txt @@ -0,0 +1,26 @@ +# These rules tell udev what device nodes to create for aoe support. +# They may be installed along the following lines. Check the section +# 8 udev manpage to see whether your udev supports SUBSYSTEM, and +# whether it uses one or two equal signs for SUBSYSTEM and KERNEL. +# +# ecashin@makki ~$ su +# Password: +# bash# find /etc -type f -name udev.conf +# /etc/udev/udev.conf +# bash# grep udev_rules= /etc/udev/udev.conf +# udev_rules="/etc/udev/rules.d/" +# bash# ls /etc/udev/rules.d/ +# 10-wacom.rules 50-udev.rules +# bash# cp /path/to/linux/Documentation/admin-guide/aoe/udev.txt \ +# /etc/udev/rules.d/60-aoe.rules +# + +# aoe char devices +SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220" +SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440" +SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220" +SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220" +SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" + +# aoe block devices +KERNEL=="etherd*", GROUP="disk" diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 9228fbf5ce4e..1f0d9b939311 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -83,6 +83,7 @@ configure specific aspects of kernel behavior to your liking. namespaces/index perf-security acpi/index + aoe/index device-mapper/index laptops/index diff --git a/Documentation/aoe/aoe.rst b/Documentation/aoe/aoe.rst deleted file mode 100644 index 58747ecec71d..000000000000 --- a/Documentation/aoe/aoe.rst +++ /dev/null @@ -1,150 +0,0 @@ -Introduction -============ - -ATA over Ethernet is a network protocol that provides simple access to -block storage on the LAN. - - http://support.coraid.com/documents/AoEr11.txt - -The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ... - - http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html - -It has many tips and hints! Please see, especially, recommended -tunings for virtual memory: - - http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19 - -The aoetools are userland programs that are designed to work with this -driver. The aoetools are on sourceforge. - - http://aoetools.sourceforge.net/ - -The scripts in this Documentation/aoe directory are intended to -document the use of the driver and are not necessary if you install -the aoetools. - - -Creating Device Nodes -===================== - - Users of udev should find the block device nodes created - automatically, but to create all the necessary device nodes, use the - udev configuration rules provided in udev.txt (in this directory). - - There is a udev-install.sh script that shows how to install these - rules on your system. - - There is also an autoload script that shows how to edit - /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when - necessary. Preloading the aoe module is preferable to autoloading, - however, because AoE discovery takes a few seconds. It can be - confusing when an AoE device is not present the first time the a - command is run but appears a second later. - -Using Device Nodes -================== - - "cat /dev/etherd/err" blocks, waiting for error diagnostic output, - like any retransmitted packets. - - "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to - limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from - untrusted networks should be ignored as a matter of security. See - also the aoe_iflist driver option described below. - - "echo > /dev/etherd/discover" tells the driver to find out what AoE - devices are available. - - In the future these character devices may disappear and be replaced - by sysfs counterparts. Using the commands in aoetools insulates - users from these implementation details. - - The block devices are named like this:: - - e{shelf}.{slot} - e{shelf}.{slot}p{part} - - ... so that "e0.2" is the third blade from the left (slot 2) in the - first shelf (shelf address zero). That's the whole disk. The first - partition on that disk would be "e0.2p1". - -Using sysfs -=========== - - Each aoe block device in /sys/block has the extra attributes of - state, mac, and netif. The state attribute is "up" when the device - is ready for I/O and "down" if detected but unusable. The - "down,closewait" state shows that the device is still open and - cannot come up again until it has been closed. - - The mac attribute is the ethernet address of the remote AoE device. - The netif attribute is the network interface on the localhost - through which we are communicating with the remote AoE device. - - There is a script in this directory that formats this information in - a convenient way. Users with aoetools should use the aoe-stat - command:: - - root@makki root# sh Documentation/aoe/status.sh - e10.0 eth3 up - e10.1 eth3 up - e10.2 eth3 up - e10.3 eth3 up - e10.4 eth3 up - e10.5 eth3 up - e10.6 eth3 up - e10.7 eth3 up - e10.8 eth3 up - e10.9 eth3 up - e4.0 eth1 up - e4.1 eth1 up - e4.2 eth1 up - e4.3 eth1 up - e4.4 eth1 up - e4.5 eth1 up - e4.6 eth1 up - e4.7 eth1 up - e4.8 eth1 up - e4.9 eth1 up - - Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver - option discussed below) instead of /dev/etherd/interfaces to limit - AoE traffic to the network interfaces in the given - whitespace-separated list. Unlike the old character device, the - sysfs entry can be read from as well as written to. - - It's helpful to trigger discovery after setting the list of allowed - interfaces. The aoetools package provides an aoe-discover script - for this purpose. You can also directly use the - /dev/etherd/discover special file described above. - -Driver Options -============== - - There is a boot option for the built-in aoe driver and a - corresponding module parameter, aoe_iflist. Without this option, - all network interfaces may be used for ATA over Ethernet. Here is a - usage example for the module parameter:: - - modprobe aoe_iflist="eth1 eth3" - - The aoe_deadsecs module parameter determines the maximum number of - seconds that the driver will wait for an AoE device to provide a - response to an AoE command. After aoe_deadsecs seconds have - elapsed, the AoE device will be marked as "down". A value of zero - is supported for testing purposes and makes the aoe driver keep - trying AoE commands forever. - - The aoe_maxout module parameter has a default of 128. This is the - maximum number of unresponded packets that will be sent to an AoE - target at one time. - - The aoe_dyndevs module parameter defaults to 1, meaning that the - driver will assign a block device minor number to a discovered AoE - target based on the order of its discovery. With dynamic minor - device numbers in use, a greater range of AoE shelf and slot - addresses can be supported. Users with udev will never have to - think about minor numbers. Using aoe_dyndevs=0 allows device nodes - to be pre-created using a static minor-number scheme with the - aoe-mkshelf script in the aoetools. diff --git a/Documentation/aoe/autoload.sh b/Documentation/aoe/autoload.sh deleted file mode 100644 index 815dff4691c9..000000000000 --- a/Documentation/aoe/autoload.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -# set aoe to autoload by installing the -# aliases in /etc/modprobe.d/ - -f=/etc/modprobe.d/aoe.conf - -if test ! -r $f || test ! -w $f; then - echo "cannot configure $f for module autoloading" 1>&2 - exit 1 -fi - -grep major-152 $f >/dev/null -if [ $? = 1 ]; then - echo alias block-major-152 aoe >> $f - echo alias char-major-152 aoe >> $f -fi - diff --git a/Documentation/aoe/examples.rst b/Documentation/aoe/examples.rst deleted file mode 100644 index 91f3198e52c1..000000000000 --- a/Documentation/aoe/examples.rst +++ /dev/null @@ -1,23 +0,0 @@ -Example of udev rules ---------------------- - - .. include:: udev.txt - :literal: - -Example of udev install rules script ------------------------------------- - - .. literalinclude:: udev-install.sh - :language: shell - -Example script to get status ----------------------------- - - .. literalinclude:: status.sh - :language: shell - -Example of AoE autoload script ------------------------------- - - .. literalinclude:: autoload.sh - :language: shell diff --git a/Documentation/aoe/index.rst b/Documentation/aoe/index.rst deleted file mode 100644 index 4394b9b7913c..000000000000 --- a/Documentation/aoe/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -:orphan: - -======================= -ATA over Ethernet (AoE) -======================= - -.. toctree:: - :maxdepth: 1 - - aoe - todo - examples - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/aoe/status.sh b/Documentation/aoe/status.sh deleted file mode 100644 index eeec7baae57a..000000000000 --- a/Documentation/aoe/status.sh +++ /dev/null @@ -1,30 +0,0 @@ -#! /bin/sh -# collate and present sysfs information about AoE storage -# -# A more complete version of this script is aoe-stat, in the -# aoetools. - -set -e -format="%8s\t%8s\t%8s\n" -me=`basename $0` -sysd=${sysfs_dir:-/sys} - -# printf "$format" device mac netif state - -# Suse 9.1 Pro doesn't put /sys in /etc/mtab -#test -z "`mount | grep sysfs`" && { -test ! -d "$sysd/block" && { - echo "$me Error: sysfs is not mounted" 1>&2 - exit 1 -} - -for d in `ls -d $sysd/block/etherd* 2>/dev/null | grep -v p` end; do - # maybe ls comes up empty, so we use "end" - test $d = end && continue - - dev=`echo "$d" | sed 's/.*!//'` - printf "$format" \ - "$dev" \ - "`cat \"$d/netif\"`" \ - "`cat \"$d/state\"`" -done | sort diff --git a/Documentation/aoe/todo.rst b/Documentation/aoe/todo.rst deleted file mode 100644 index dea8db5a33e1..000000000000 --- a/Documentation/aoe/todo.rst +++ /dev/null @@ -1,17 +0,0 @@ -TODO -==== - -There is a potential for deadlock when allocating a struct sk_buff for -data that needs to be written out to aoe storage. If the data is -being written from a dirty page in order to free that page, and if -there are no other pages available, then deadlock may occur when a -free page is needed for the sk_buff allocation. This situation has -not been observed, but it would be nice to eliminate any potential for -deadlock under memory pressure. - -Because ATA over Ethernet is not fragmented by the kernel's IP code, -the destructor member of the struct sk_buff is available to the aoe -driver. By using a mempool for allocating all but the first few -sk_buffs, and by registering a destructor, we should be able to -efficiently allocate sk_buffs without introducing any potential for -deadlock. diff --git a/Documentation/aoe/udev-install.sh b/Documentation/aoe/udev-install.sh deleted file mode 100644 index 15e86f58c036..000000000000 --- a/Documentation/aoe/udev-install.sh +++ /dev/null @@ -1,33 +0,0 @@ -# install the aoe-specific udev rules from udev.txt into -# the system's udev configuration -# - -me="`basename $0`" - -# find udev.conf, often /etc/udev/udev.conf -# (or environment can specify where to find udev.conf) -# -if test -z "$conf"; then - if test -r /etc/udev/udev.conf; then - conf=/etc/udev/udev.conf - else - conf="`find /etc -type f -name udev.conf 2> /dev/null`" - if test -z "$conf" || test ! -r "$conf"; then - echo "$me Error: no udev.conf found" 1>&2 - exit 1 - fi - fi -fi - -# find the directory where udev rules are stored, often -# /etc/udev/rules.d -# -rules_d="`sed -n '/^udev_rules=/{ s!udev_rules=!!; s!\"!!g; p; }' $conf`" -if test -z "$rules_d" ; then - rules_d=/etc/udev/rules.d -fi -if test ! -d "$rules_d"; then - echo "$me Error: cannot find udev rules directory" 1>&2 - exit 1 -fi -sh -xc "cp `dirname $0`/udev.txt $rules_d/60-aoe.rules" diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt deleted file mode 100644 index 54feda5a0772..000000000000 --- a/Documentation/aoe/udev.txt +++ /dev/null @@ -1,26 +0,0 @@ -# These rules tell udev what device nodes to create for aoe support. -# They may be installed along the following lines. Check the section -# 8 udev manpage to see whether your udev supports SUBSYSTEM, and -# whether it uses one or two equal signs for SUBSYSTEM and KERNEL. -# -# ecashin@makki ~$ su -# Password: -# bash# find /etc -type f -name udev.conf -# /etc/udev/udev.conf -# bash# grep udev_rules= /etc/udev/udev.conf -# udev_rules="/etc/udev/rules.d/" -# bash# ls /etc/udev/rules.d/ -# 10-wacom.rules 50-udev.rules -# bash# cp /path/to/linux/Documentation/aoe/udev.txt \ -# /etc/udev/rules.d/60-aoe.rules -# - -# aoe char devices -SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220" -SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440" -SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220" -SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220" -SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" - -# aoe block devices -KERNEL=="etherd*", GROUP="disk" -- cgit From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 13:08:35 -0300 Subject: docs: cgroup-v1: add it to the admin-guide book Those files belong to the admin guide, so add them. Signed-off-by: Mauro Carvalho Chehab --- .../admin-guide/cgroup-v1/blkio-controller.rst | 302 ++++++ Documentation/admin-guide/cgroup-v1/cgroups.rst | 695 ++++++++++++++ Documentation/admin-guide/cgroup-v1/cpuacct.rst | 50 + Documentation/admin-guide/cgroup-v1/cpusets.rst | 866 +++++++++++++++++ Documentation/admin-guide/cgroup-v1/devices.rst | 132 +++ .../admin-guide/cgroup-v1/freezer-subsystem.rst | 127 +++ Documentation/admin-guide/cgroup-v1/hugetlb.rst | 50 + Documentation/admin-guide/cgroup-v1/index.rst | 28 + Documentation/admin-guide/cgroup-v1/memcg_test.rst | 355 +++++++ Documentation/admin-guide/cgroup-v1/memory.rst | 1003 ++++++++++++++++++++ Documentation/admin-guide/cgroup-v1/net_cls.rst | 44 + Documentation/admin-guide/cgroup-v1/net_prio.rst | 57 ++ Documentation/admin-guide/cgroup-v1/pids.rst | 92 ++ Documentation/admin-guide/cgroup-v1/rdma.rst | 117 +++ Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 4 +- .../admin-guide/mm/numa_memory_policy.rst | 2 +- Documentation/block/bfq-iosched.rst | 2 +- Documentation/cgroup-v1/blkio-controller.rst | 302 ------ Documentation/cgroup-v1/cgroups.rst | 695 -------------- Documentation/cgroup-v1/cpuacct.rst | 50 - Documentation/cgroup-v1/cpusets.rst | 866 ----------------- Documentation/cgroup-v1/devices.rst | 132 --- Documentation/cgroup-v1/freezer-subsystem.rst | 127 --- Documentation/cgroup-v1/hugetlb.rst | 50 - Documentation/cgroup-v1/index.rst | 30 - Documentation/cgroup-v1/memcg_test.rst | 355 ------- Documentation/cgroup-v1/memory.rst | 1003 -------------------- Documentation/cgroup-v1/net_cls.rst | 44 - Documentation/cgroup-v1/net_prio.rst | 57 -- Documentation/cgroup-v1/pids.rst | 92 -- Documentation/cgroup-v1/rdma.rst | 117 --- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/kernel-per-CPU-kthreads.txt | 2 +- Documentation/scheduler/sched-deadline.rst | 2 +- Documentation/scheduler/sched-design-CFS.rst | 2 +- Documentation/scheduler/sched-rt-group.rst | 2 +- Documentation/vm/numa.rst | 4 +- Documentation/vm/page_migration.rst | 2 +- Documentation/vm/unevictable-lru.rst | 2 +- Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 4 +- 42 files changed, 3935 insertions(+), 3936 deletions(-) create mode 100644 Documentation/admin-guide/cgroup-v1/blkio-controller.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cgroups.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpuacct.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpusets.rst create mode 100644 Documentation/admin-guide/cgroup-v1/devices.rst create mode 100644 Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst create mode 100644 Documentation/admin-guide/cgroup-v1/hugetlb.rst create mode 100644 Documentation/admin-guide/cgroup-v1/index.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memcg_test.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memory.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_cls.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_prio.rst create mode 100644 Documentation/admin-guide/cgroup-v1/pids.rst create mode 100644 Documentation/admin-guide/cgroup-v1/rdma.rst delete mode 100644 Documentation/cgroup-v1/blkio-controller.rst delete mode 100644 Documentation/cgroup-v1/cgroups.rst delete mode 100644 Documentation/cgroup-v1/cpuacct.rst delete mode 100644 Documentation/cgroup-v1/cpusets.rst delete mode 100644 Documentation/cgroup-v1/devices.rst delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst delete mode 100644 Documentation/cgroup-v1/hugetlb.rst delete mode 100644 Documentation/cgroup-v1/index.rst delete mode 100644 Documentation/cgroup-v1/memcg_test.rst delete mode 100644 Documentation/cgroup-v1/memory.rst delete mode 100644 Documentation/cgroup-v1/net_cls.rst delete mode 100644 Documentation/cgroup-v1/net_prio.rst delete mode 100644 Documentation/cgroup-v1/pids.rst delete mode 100644 Documentation/cgroup-v1/rdma.rst (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst new file mode 100644 index 000000000000..1d7d962933be --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst @@ -0,0 +1,302 @@ +=================== +Block IO Controller +=================== + +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +One IO control policy is throttling policy which can be used to +specify upper IO rate limits on devices. This policy is implemented in +generic block layer and can be used on leaf nodes as well as higher +level logical devices like device mapper. + +HOWTO +===== +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller:: + + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer:: + + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: + + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is ": ":: + + echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16. + +- Run dd to read a file and see if rate is throttled to 1MB/s or not:: + + # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.throttle.write_bps_device file. + +Hierarchical Cgroups +==================== + +Throttling implements hierarchy support; however, +throttling's hierarchy support is enabled iff "sane_behavior" is +enabled from cgroup side, which currently is a development option and +not publicly available. + +If somebody created a hierarchy like as follows:: + + root + / \ + test1 test2 + | + test3 + +Throttling with "sane_behavior" will handle the +hierarchy correctly. For throttling, all limits apply +to the whole subtree while all statistics are local to the IOs +directly generated by tasks in that cgroup. + +Throttling without "sane_behavior" enabled from cgroup side will +practically treat all groups at same level as if it looks like the +following:: + + pivot + / / \ \ + root test1 test2 test3 + +Various user visible config options +=================================== +CONFIG_BLK_CGROUP + - Block IO controller. + +CONFIG_BFQ_CGROUP_DEBUG + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + +Details of cgroup files +======================= +Proportional weight policy files +-------------------------------- +- blkio.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). + Currently allowed range of weights is from 10 to 1000. + +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + Following is the format:: + + # echo dev_maj:dev_minor weight > blkio.weight_device + + Configure weight=300 on /dev/sdb (8:16) in this cgroup:: + + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup:: + + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup:: + + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + +- blkio.leaf_weight[_device] + - Equivalents of blkio.weight[_device] for the purpose of + deciding how much weight tasks in the given cgroup has while + competing with the cgroup's child cgroups. For details, + please refer to Documentation/block/cfq-iosched.txt. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This + gives the statistics about how many a times a group was dequeued + from service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +- blkio.*_recursive + - Recursive version of various stats. These files show the + same information as their non-recursive counterparts but + include stats from all the descendant cgroups. + +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in io per second. Rules are per device. Following is + the format:: + + echo ": " > /cgrp/blkio.throttle.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +Common files among various policies +----------------------------------- +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst new file mode 100644 index 000000000000..b0688011ed06 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst @@ -0,0 +1,695 @@ +============== +Control Groups +============== + +Written by Paul Menage based on +Documentation/admin-guide/cgroup-v1/cpusets.rst + +Original copyright statements from cpusets.txt: + +Portions Copyright (C) 2004 BULL SA. + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. + +Modified by Paul Jackson + +Modified by Christoph Lameter + +.. CONTENTS: + + 1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name + 3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API + 4. Extended attributes usage + 5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? +---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource-tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +up in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. The resource planning for this server could be along the +following lines:: + + CPU : "Top cpuset" + / \ + CPUSet1 CPUSet2 + | | + (Professors) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), Students (30%), system (20%) + + Disk : Professors (50%), Students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Professors (15%) students (5%) + +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes +into the NFS network class. + +At the same time Firefox/Lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies), +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can:: + + # echo browser_pid > /sys/fs/cgroup///tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +appropriate network and other resource class. This may lead to +proliferation of such cgroups. + +Also let's say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :)) OR give one of the student's simulation +apps enhanced CPU power. + +With ability to write PIDs directly to resource classes, it's just a +matter of:: + + # echo pid > /sys/fs/cgroup/network//tasks + (after some time) + # echo pid > /sys/fs/cgroup/network//tasks + +Without this ability, the administrator would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ? +--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by PID) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance-critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition, a new file system of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by PID) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. + +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like:: + + 1) mount -t tmpfs cgroup_root /sys/fs/cgroup + 2) mkdir /sys/fs/cgroup/cpuset + 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 4) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 5) Start a task that will be the "founding father" of the new job. + 6) Attach that task to the new cgroup by writing its PID to the + /sys/fs/cgroup/cpuset tasks file for that cgroup. + 7) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup:: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/cpuset + mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy with all available subsystems, type:: + + # mount -t cgroup xxx /sys/fs/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +Note: Some subsystems do not work without some user input first. For instance, +if cpusets are enabled the user will have to populate the cpus and mems files +for each new cgroup created before that group can be used. + +As explained in section `1.2 Why are cgroups needed?` you should create +different hierarchies of cgroups for each single resource or group of +resources you want to control. Therefore, you should mount a tmpfs on +/sys/fs/cgroup and create directories for each cgroup resource or resource +group:: + + # mount -t tmpfs cgroup_root /sys/fs/cgroup + # mkdir /sys/fs/cgroup/rg1 + +To mount a cgroup hierarchy with just the cpuset and memory +subsystems, type:: + + # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 + +While remounting cgroups is currently supported, it is not recommend +to use it. Remounting allows changing bound subsystems and +release_agent. Rebinding is hardly useful as it only works when the +hierarchy is empty and release_agent itself should be replaced with +conventional fsnotify. The support for remounting will be removed in +the future. + +To Specify a hierarchy's release_agent:: + + # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ + xxx /sys/fs/cgroup/rg1 + +Note that specifying 'release_agent' more than once will return failure. + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 +is the cgroup that holds the whole system. + +If you want to change the value of release_agent:: + + # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent + +It can also be changed via remount. + +If you want to create a new cgroup under /sys/fs/cgroup/rg1:: + + # cd /sys/fs/cgroup/rg1 + # mkdir my_cgroup + +Now you want to do something with this cgroup: + + # cd my_cgroup + +In this directory you can find several files:: + + # ls + cgroup.procs notify_on_release tasks + (plus whatever files added by the attached subsystems) + +Now attach your shell to this cgroup:: + + # /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cgroup, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + +You can attach the current shell task by echoing 0:: + + # echo 0 > tasks + +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the PID of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. + +Note: Since every task is always a member of exactly one cgroup in each +mounted hierarchy, to remove a task from its current cgroup you must +move it into a new cgroup (possibly the root cgroup) by writing to the +new cgroup's tasks file. + +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. + +2.3 Mounting hierarchies by name +-------------------------------- + +Passing the name= option when mounting a cgroups hierarchy +associates the given name with the hierarchy. This can be used when +mounting a pre-existing hierarchy, in order to refer to it by name +rather than by its set of active subsystems. Each hierarchy is either +nameless, or has a unique name. + +The name should match [\w.-]+ + +When passing a name= option for a new hierarchy, you need to +specify subsystems manually; the legacy behaviour of mounting all +subsystems when none are explicitly specified is not supported when +you give a subsystem a name. + +The name of the subsystem appears as part of the hierarchy description +in /proc/mounts and /proc//cgroups. + + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem ID which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be managing. + +- name: should be initialized to a unique subsystem name. Should be + no longer than MAX_CGROUP_TYPE_NAMELEN. + +- early_init: indicate if the subsystem needs early initialization + at system boot. + +Each cgroup object created by the system has an array of pointers, +indexed by subsystem ID; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +----------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called _cgrp_subsys + +Each subsystem may export the following methods. The only mandatory +methods are css_alloc/free. Any others that are null are presumed to +be successful no-ops. + +``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called to allocate a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or a +ERR_PTR() value. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. + +``int css_online(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called after @cgrp successfully completed all allocations and made +visible to cgroup_for_each_child/descendant_*() iterators. The +subsystem may choose to fail creation by returning -errno. This +callback can be used to implement reliable state sharing and +propagation along the hierarchy. See the comment on +cgroup_for_each_descendant_pre() for details. + +``void css_offline(struct cgroup *cgrp);`` +(cgroup_mutex held by caller) + +This is the counterpart of css_online() and called iff css_online() +has succeeded on @cgrp. This signifies the beginning of the end of +@cgrp. @cgrp is being removed and the subsystem should start dropping +all references it's holding on @cgrp. When all references are dropped, +cgroup removal will proceed to the next step - css_free(). After this +callback, @cgrp should be considered dead to the subsystem. + +``void css_free(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +The cgroup system is about to free @cgrp; the subsystem should free +its subsystem state object. By the time this method is called, @cgrp +is completely unused; @cgrp->parent is still valid. (Note - can also +be called for a newly-created cgroup if an error occurs after this +subsystem's create() method has been called for the new cgroup). + +``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called prior to moving one or more tasks into a cgroup; if the +subsystem returns an error, this will abort the attach operation. +@tset contains the tasks to be attached and is guaranteed to have at +least one task in it. + +If there are multiple tasks in the taskset, then: + - it's guaranteed that all are from the same thread group + - @tset contains all tasks from the thread group whether or not + they're switching cgroups + - the first task is the leader + +Each @tset entry also contains the task's old cgroup and tasks which +aren't switching cgroup can be skipped easily using the +cgroup_taskset_for_each() iterator. Note that this isn't called on a +fork. If this method returns 0 (success) then this should remain valid +while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. + +``void css_reset(struct cgroup_subsys_state *css)`` +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + +``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsystem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. The parameters are identical to can_attach(). + +``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. +The parameters are identical to can_attach(). + +``void fork(struct task_struct *task)`` + +Called when a task is forked into a cgroup. + +``void exit(struct task_struct *task)`` + +Called during task exit. + +``void free(struct task_struct *task)`` + +Called when the task_struct is freed. + +``void bind(struct cgroup *root)`` +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Extended attribute usage +=========================== + +cgroup filesystem supports certain types of extended attributes in its +directories and files. The current supported types are: + + - Trusted (XATTR_TRUSTED) + - Security (XATTR_SECURITY) + +Both require CAP_SYS_ADMIN capability to set. + +Like in tmpfs, the extended attributes in cgroup filesystem are stored +using kernel memory and it's advised to keep the usage at minimum. This +is the reason why user defined extended attributes are not supported, since +any user can do it and there's no limit in the value size. + +The current known users for this feature are SELinux to limit cgroup usage +in containers and systemd for assorted meta data like main PID in a cgroup +(systemd creates a cgroup per service). + +5. Questions +============ + +:: + + Q: what's up with this '/bin/echo' ? + A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. + + Q: When I attach processes, only the first of the line gets really attached ! + A: We can only return one error code per call to write(). So you should also + put only ONE PID. diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst new file mode 100644 index 000000000000..d30ed81d2ad7 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst @@ -0,0 +1,50 @@ +========================= +CPU Accounting Controller +========================= + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -ocpuacct none /sys/fs/cgroup + +With the above step, the initial or the parent accounting group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. +/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained +by this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/sys/fs/cgroup/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. +system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst new file mode 100644 index 000000000000..86a6ae995d54 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -0,0 +1,866 @@ +======= +CPUSETS +======= + +Copyright (C) 2004 BULL SA. + +Written by Simon.Derr@bull.net + +- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +- Modified by Paul Jackson +- Modified by Christoph Lameter +- Modified by Paul Menage +- Modified by Hidetoshi Seto + +.. CONTENTS: + + 1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes + 3. Questions + 4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/admin-guide/cgroup-v1/cgroups.rst. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that task's cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that task's cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendants) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that task's cpuset. + - in sched.c migrate_live_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that task's cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the task's cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example:: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpuset.cpus: list of CPUs in that cpuset + - cpuset.mems: list of Memory Nodes in that cpuset + - cpuset.memory_migrate flag: if set, move pages to cpusets nodes + - cpuset.cpu_exclusive flag: is cpu placement exclusive? + - cpuset.mem_exclusive flag: is memory placement exclusive? + - cpuset.mem_hardwall flag: is memory allocation hardwalled + - cpuset.memory_pressure: measure of how much paging pressure in cpuset + - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset + - cpuset.sched_relax_domain_level: the searching range when migrating tasks + +In addition, only the root cpuset has the following file: + + - cpuset.memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_mask using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendant, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> + Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. + +This default load balancing across all CPUs is not well suited for +the following two situations: + + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"cpuset.sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendant cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (struct cpumask) in the +partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +every time. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searches all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'cpuset.sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + +====== =========================================================== + -1 no request. use system default or follow request of others. + 0 no search. + 1 search siblings (hyperthreads in a core). + 2 search cores in a package. + 3 search cpus in a node [= system wide on non-NUMA system] + 4 search nodes in a chunk of node [on NUMA system] + 5 search system wide [on NUMA system] +====== =========================================================== + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset +is disabled, then 'cpuset.sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not depends on your situation. +Don't modify this file if you are not sure. + +If your situation is: + + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. + then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpuset's 'tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'cpuset.mems' subsequently changes. +If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset:: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. + (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: + + # cd /sys/fs/cgroup/cpuset + # mkdir my_cpuset + +Now you want to do something with this cpuset:: + + # cd my_cpuset + +In this directory you can find several files:: + + # ls + cgroup.clone_children cpuset.memory_pressure + cgroup.event_control cpuset.memory_spread_page + cgroup.procs cpuset.memory_spread_slab + cpuset.cpu_exclusive cpuset.mems + cpuset.cpus cpuset.sched_load_balance + cpuset.mem_exclusive cpuset.sched_relax_domain_level + cpuset.mem_hardwall notify_on_release + cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags:: + + # /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus:: + + # /bin/echo 0-7 > cpuset.cpus + +Add some mems:: + + # /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset:: + + # /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cpuset, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command:: + + mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to:: + + mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset + echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories:: + + # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset:: + + # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs:: + + # /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple:: + + # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' + # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: + what's up with this '/bin/echo' ? + +A: + bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: + When I attach processes, only the first of the line gets really attached ! + +A: + We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst new file mode 100644 index 000000000000..e1886783961e --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/devices.rst @@ -0,0 +1,132 @@ +=========================== +Device Whitelist Controller +=========================== + +1. Description +============== + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface +================= + +An entry is added using devices.allow, and removed using +devices.deny. For instance:: + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing:: + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing:: + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security +=========== + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy +============ + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example:: + + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A:: + + # echo "c 116:* r" > A/devices.deny + +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed:: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated:: + + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding ``c *:3 rwm``:: + + # echo "c *:3 rwm" >A/devices.allow + +the result:: + + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B:: + + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow + +or even:: + + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) +--------------------------------------- + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules. diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst new file mode 100644 index 000000000000..582d3427de3f --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst @@ -0,0 +1,127 @@ +============== +Cgroup Freezer +============== + +The cgroup freezer is useful to batch job management system which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + +The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze. While SIGSTOP cannot be caught, +blocked, or ignored it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells:: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16690 + + + +This happens because bash can observe both signals and choose how it +responds to them. + +Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + +In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. This allows the bash example above and gdb to run as +expected. + +The cgroup freezer is hierarchical. Freezing a cgroup freezes all +tasks belonging to the cgroup and all its descendant cgroups. Each +cgroup has its own state (self-state) and the state inherited from the +parent (parent-state). Iff both states are THAWED, the cgroup is +THAWED. + +The following cgroupfs files are created by cgroup freezer. + +* freezer.state: Read-write. + + When read, returns the effective state of the cgroup - "THAWED", + "FREEZING" or "FROZEN". This is the combined self and parent-states. + If any is freezing, the cgroup is freezing (FREEZING or FROZEN). + + FREEZING cgroup transitions into FROZEN state when all tasks + belonging to the cgroup and its descendants become frozen. Note that + a cgroup reverts to FREEZING from FROZEN after a new task is added + to the cgroup or one of its descendant cgroups until the new task is + frozen. + + When written, sets the self-state of the cgroup. Two values are + allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, + if not already freezing, enters FREEZING state along with all its + descendant cgroups. + + If THAWED is written, the self-state of the cgroup is changed to + THAWED. Note that the effective state may not change to THAWED if + the parent-state is still freezing. If a cgroup's effective state + becomes THAWED, all its descendants which are freezing because of + the cgroup also leave the freezing state. + +* freezer.self_freezing: Read only. + + Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. + This value is 1 iff the last write to freezer.state was "FROZEN". + +* freezer.parent_freezing: Read only. + + Shows the parent-state. 0 if none of the cgroup's ancestors is + frozen; otherwise, 1. + +The root cgroup is non-freezable and the above interface files don't +exist. + +* Examples of usage:: + + # mkdir /sys/fs/cgroup/freezer + # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer + # mkdir /sys/fs/cgroup/freezer/0 + # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks + +to get status of the freezer subsystem:: + + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +to freeze all tasks in the container:: + + # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + FREEZING + # cat /sys/fs/cgroup/freezer/0/freezer.state + FROZEN + +to unfreeze all tasks in the container:: + + # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst new file mode 100644 index 000000000000..a3902aa253a9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -0,0 +1,50 @@ +================== +HugeTLB Controller +================== + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that, +the application will get SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. This requires the application to know beforehand how much +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. + +New groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. + +Brief summary of control files:: + + hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit + +For a system supporting three hugepage sizes (64k, 32M and 1G), the control +files include:: + + hugetlb.1GB.limit_in_bytes + hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.usage_in_bytes + hugetlb.1GB.failcnt + hugetlb.64KB.limit_in_bytes + hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.usage_in_bytes + hugetlb.64KB.failcnt + hugetlb.32MB.limit_in_bytes + hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.usage_in_bytes + hugetlb.32MB.failcnt diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst new file mode 100644 index 000000000000..10bf48bae0b0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -0,0 +1,28 @@ +======================== +Control Groups version 1 +======================== + +.. toctree:: + :maxdepth: 1 + + cgroups + + blkio-controller + cpuacct + cpusets + devices + freezer-subsystem + hugetlb + memcg_test + memory + net_cls + net_prio + pids + rdma + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst new file mode 100644 index 000000000000..3f7115e07b5d --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst @@ -0,0 +1,355 @@ +===================================================== +Memory Resource Controller(Memcg) Implementation Memo +===================================================== + +Last Updated: 2010/2 + +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst) + +0. How to record usage ? +======================== + + 2 objects are used. + + page_cgroup ....an object per page. + + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge +========= + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_try_charge() + +2. Uncharge +=========== + + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0. + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + +3. charge-commit-cancel +======================= + + Memcg pages are charged in two steps: + + - mem_cgroup_try_charge() + - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the page is associated with the memcg. + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous +============ + + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + +5. Page Cache +============= + + Page Cache is charged at + - add_to_page_cache_locked(). + + The logic is very clear. (About migration, see below) + + Note: + __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache +=========================== + + The best way to understand shmem's page state transition is to read + mm/shmem.c. + + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + +7. Page Migration +================= + + mem_cgroup_migrate() + +8. LRU +====== + Each memcg has its own private LRU. Now, its handling is under global + VM's control (means that it's handled under global pgdat->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under pgdat->lru_lock. + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. +================= + + Tests for racy cases. + +9.1 Small limit to memcg. +------------------------- + + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + +9.2 Shmem +--------- + + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + +9.3 Migration +------------- + + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration:: + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset.:: + + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + +9.4 Memory hotplug +------------------ + + memory hotplug test is one of good test. + + to offline memory, do following:: + + # echo offline > /sys/devices/system/memory/memoryXXX/state + + (XXX is the place of memory) + + This is an easy way to test page migration, too. + +9.5 mkdir/rmdir +--------------- + + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following:: + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running:: + + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + +9.6 Mount with other subsystems +------------------------------- + + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example:: + + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. + +9.7 swapoff +----------- + + Besides management of swap is one of complicated parts of memcg, + call path of swap-in at swapoff is not same as usual swap-in path.. + It's worth to be tested explicitly. + + For example, test like following is good: + + (Shell-A):: + + # mount -t cgroup none /cgroup -o memory + # mkdir /cgroup/test + # echo 40M > /cgroup/test/memory.limit_in_bytes + # echo 0 > /cgroup/test/tasks + + Run malloc(100M) program under this. You'll see 60M of swaps. + + (Shell-B):: + + # move all tasks in /cgroup/test to /cgroup + # /sbin/swapoff -a + # rmdir /cgroup/test + # kill malloc task. + + Of course, tmpfs v.s. swapoff test should be tested, too. + +9.8 OOM-Killer +-------------- + + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + + Case A) when you can swapoff:: + + #swapoff -a + #echo 50M > /memory.limit_in_bytes + + run 51M of malloc + + Case B) when you use mem+swap limitation:: + + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + + run 51M of malloc + +9.9 Move charges at task migration +---------------------------------- + + Charges associated with a task can be moved along with task migration. + + (Shell-A):: + + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B):: + + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading ``*.usage_in_bytes`` or + memory.stat of both A and B. + + See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should + be written to move_charge_at_immigrate. + +9.10 Memory thresholds +---------------------- + + Memory controller implements memory thresholds using cgroups notification + API. You can use tools/cgroup/cgroup_event_listener.c to test it. + + (Shell-A) Create cgroup and run event listener:: + + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory:: + + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst new file mode 100644 index 000000000000..41bdc038dad9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -0,0 +1,1003 @@ +========================== +Memory Resource Controller +========================== + +NOTE: + This document is hopelessly outdated and it asks for a complete + rewrite. It still contains a useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. + +NOTE: + The Memory Resource Controller has generically been referred to as the + memory controller in this document. Do not confuse memory controller + used here with the memory controller that is used in hardware. + +(For editors) In this document: + When we mention a cgroup (cgroupfs's directory) with memory controller, + we call it "memory cgroup". When you see git-log and source code, you'll + see patch's title and function names tend to use "memcg". + In this document, we avoid using it. + +Benefits and Purpose of the memory controller +============================================= + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory-hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with a limited amount of memory; this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases; find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +Current Status: linux-2.6.34-mmotm(development version of 2010/April) + +Features: + + - accounting anonymous pages, file caches, swap caches usage and limiting them. + - pages are linked to per-memcg LRU exclusively, and there is no global LRU. + - optionally, memory+swap usage can be accounted and limited. + - hierarchical accounting + - soft limit + - moving (recharging) account at moving a task is selectable. + - usage threshold notifier + - memory pressure notifier + - oom-killer disable knob and oom-notifier + - Root cgroup has no limit controls. + + Kernel memory support is a work in progress, and the current version provides + basically functionality. (See Section 2.7) + +Brief summary of control files. + +==================================== ========================================== + tasks attach a task(thread) and show list of + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap + (See 5.5 for details) + memory.limit_in_bytes set/show limit of memory usage + memory.memsw.limit_in_bytes set/show limit of memory+Swap usage + memory.failcnt show the number of memory usage hits limits + memory.memsw.failcnt show the number of memory+Swap hits limits + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage + memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + memory.force_empty trigger forced page reclaim + memory.pressure_level set memory pressure notifications + memory.swappiness set/show swappiness parameter of vmscan + (See sysctl's vm.swappiness) + memory.move_charge_at_immigrate set/show controls of moving charges + memory.oom_control set/show oom controls. + memory.numa_stat show the number of memory usage per numa + node + + memory.kmem.limit_in_bytes set/show hard limit for kernel memory + memory.kmem.usage_in_bytes show current kernel memory allocation + memory.kmem.failcnt show the number of kernel memory usage + hits limits + memory.kmem.max_usage_in_bytes show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation + memory.kmem.tcp.failcnt show the number of tcp buf memory usage + hits limits + memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded +==================================== ========================================== + +1. History +========== + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control +================= + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design +----------- + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting +--------------- + +:: + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. + +2.2.1 Accounting details +------------------------ + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. +This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting +-------------------------- + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) +-------------------------------------- + +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +**why 'memory+swap' rather than swap** + +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +**What happens when a cgroup hits memory.memsw.limit_in_bytes** + +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. + +2.5 Reclaim +----------- + +Each cgroup maintains a per cgroup LRU which has the same structure as +global VM. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. (See 10. OOM Control below.) + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per-cgroup LRU +list. + +NOTE: + Reclaim does not work for the root cgroup, since we cannot set any + limits on the root cgroup. + +Note2: + When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered. +(See oom_control section) + +2.6 Locking +----------- + + lock_page_cgroup()/unlock_page_cgroup() should not be called under + the i_pages lock. + + Other lock order is following: + + PG_locked. + mm->page_table_lock + pgdat->lru_lock + lock_page_cgroup. + + In many cases, just lock_page_cgroup() is called. + + per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by + pgdat->lru_lock, it has no lock of its own. + +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +----------------------------------------------- + +With the Kernel memory extension, the Memory Controller is able to limit +the amount of kernel memory used by the system. Kernel memory is fundamentally +different than user memory, since it can't be swapped out, which makes it +possible to DoS the system by consuming too much of this precious resource. + +Kernel memory accounting is enabled for all memory cgroups by default. But +it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel +at boot time. In this case, kernel memory will not be accounted at all. + +Kernel memory limits are not imposed for the root cgroup. Usage for the root +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). + +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. + +Currently no soft limit is implemented for kernel memory. It is future work +to trigger slab reclaim when those limits are reached. + +2.7.1 Current Kernel Memory resources accounted +----------------------------------------------- + +stack pages: + every process consumes some stack pages. By accounting into + kernel memory, we prevent new processes from being created when the kernel + memory usage is too high. + +slab pages: + pages allocated by the SLAB or SLUB allocator are tracked. A copy + of each kmem_cache is created every time the cache is touched by the first time + from inside the memcg. The creation is done lazily, so some objects can still be + skipped while the cache is being created. All objects in a slab page should + belong to the same memcg. This only fails to hold when a task is migrated to a + different memcg during the page allocation by the cache. + +sockets memory pressure: + some sockets protocols have memory pressure + thresholds. The Memory Controller allows them to be controlled individually + per cgroup, instead of globally. + +tcp memory pressure: + sockets memory pressure for the tcp protocol. + +2.7.2 Common use cases +---------------------- + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + +U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + +U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommited. + Overcommiting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + +WARNING: + In the current implementation, memory reclaim will NOT be + triggered for a cgroup when it hits K while staying below U, which makes + this setup impractical. + +U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + +3. User Interface +================= + +3.0. Configuration +------------------ + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) + +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) +------------------------------------------------------------------- + +:: + + # mount -t tmpfs none /sys/fs/cgroup + # mkdir /sys/fs/cgroup/memory + # mount -t cgroup none /sys/fs/cgroup/memory -o memory + +3.2. Make the new group and move bash into it:: + + # mkdir /sys/fs/cgroup/memory/0 + # echo $$ > /sys/fs/cgroup/memory/0/tasks + +Since now we're in the 0 cgroup, we can alter the memory limit:: + + # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + +NOTE: + We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, + mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, + Gibibytes.) + +NOTE: + We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. + +NOTE: + We cannot set limits on the root cgroup any more. + +:: + + # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes + 4194304 + +We can check the usage:: + + # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes + 1216512 + +A successful write to this file does not guarantee a successful setting of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel:: + + # echo 1 > memory.limit_in_bytes + # cat memory.limit_in_bytes + 4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing +========== + +For testing features and implementation, see memcg_test.txt. + +Performance test is also important. To see pure memory controller's overhead, +testing on tmpfs will give you good numbers of small overheads. +Example: do kernel make on tmpfs. + +Page-fault scalability is also important. At measuring parallel +page fault test, multi-process test may be better than multi-thread +test because it has noise of shared objects/status. + +But the above two are testing extreme situations. +Trying usual test under memory controller is always helpful. + +4.1 Troubleshooting +------------------- + +Sometimes a user might find that the application under a cgroup is +terminated by the OOM killer. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and +seeing what happens will be helpful. + +4.2 Task migration +------------------ + +When a task migrates from one cgroup to another, its charge is not +carried forward by default. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +You can move charges of a task along with task migration. +See 8. "Move charges at task migration" + +4.3 Removing a cgroup +--------------------- + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. (because we charge against pages, not +against tasks.) + +We move the stats to root (if use_hierarchy==0) or parent (if +use_hierarchy==1), and no change on the charge except uncharging +from the child. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + +About use_hierarchy, see Section 6. + +5. Misc. interfaces +=================== + +5.1 force_empty +--------------- + memory.force_empty interface is provided to make cgroup's memory usage empty. + When writing anything to this:: + + # echo 0 > memory.force_empty + + the cgroup will be reclaimed and as many pages reclaimed as possible. + + The typical use case for this interface is before calling rmdir(). + Though rmdir() offlines memcg, but the memcg may still stay there due to + charged file caches. Some out-of-use page caches may keep charged until + memory pressure happens. If you want to avoid that, force_empty will be useful. + + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + + About use_hierarchy, see Section 6. + +5.2 stat file +------------- + +memory.stat file includes following statistics + +per-memory cgroup local status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +=============== =============================================================== +cache # of bytes of page cache memory. +rss # of bytes of anonymous and swap cache memory (includes + transparent hugepages). +rss_huge # of bytes of anonymous transparent hugepages. +mapped_file # of bytes of mapped file (includes tmpfs/shmem) +pgpgin # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. +swap # of bytes of swap usage +dirty # of bytes that are waiting to get written back to the disk. +writeback # of bytes of file/anon cache that are queued for syncing to + disk. +inactive_anon # of bytes of anonymous and swap cache memory on inactive + LRU list. +active_anon # of bytes of anonymous and swap cache memory on active + LRU list. +inactive_file # of bytes of file-backed memory on inactive LRU list. +active_file # of bytes of file-backed memory on active LRU list. +unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). +=============== =============================================================== + +status considering hierarchy (see memory.use_hierarchy settings) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= =================================================== +hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy + under which the memory cgroup is +hierarchical_memsw_limit # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + +total_ # hierarchical version of , which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + , i.e. total_cache +========================= =================================================== + +The following additional stats are dependent on CONFIG_DEBUG_VM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= ======================================== +recent_rotated_anon VM internal parameter. (see mm/vmscan.c) +recent_rotated_file VM internal parameter. (see mm/vmscan.c) +recent_scanned_anon VM internal parameter. (see mm/vmscan.c) +recent_scanned_file VM internal parameter. (see mm/vmscan.c) +========================= ======================================== + +Memo: + recent_rotated means recent frequency of LRU rotation. + recent_scanned means recent # of scans to LRU. + showing for better debug please see the code for meanings. + +Note: + Only anonymous and swap cache memory is listed as part of 'rss' stat. + This should not be confused with the true 'resident set size' or the + amount of physical memory used by the cgroup. + + 'rss + mapped_file" will give you resident set size of cgroup. + + (Note: file and shmem may be shared among other cgroups. In that case, + mapped_file is accounted only when the memory cgroup is owner of page + cache.) + +5.3 swappiness +-------------- + +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting. + +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents from any swapping even if +there is a swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. + +5.4 failcnt +----------- + +A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. +This failcnt(== failure count) shows the number of times that a usage counter +hit its limit. When a memory cgroup hits a limit, failcnt increases and +memory under it will be reclaimed. + +You can reset failcnt by writing 0 to failcnt file:: + + # echo 0 > .../memory.failcnt + +5.5 usage_in_bytes +------------------ + +For efficiency, as other kernel components, memory cgroup uses some optimization +to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz +value for efficient access. (Of course, when necessary, it's synchronized.) +If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) +value in memory.stat(see 5.2). + +5.6 numa_stat +------------- + +This is similar to numa_maps but operates on a per-memcg basis. This is +useful for providing visibility into the numa locality information within +an memcg since the pages are allowed to be allocated from any physical +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation. + +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The output format of memory.numa_stat is:: + + total= N0= N1= ... + file= N0= N1= ... + anon= N0= N1= ... + unevictable= N0= N1= ... + hierarchical_= N0= N1= ... + +The "total" count is sum of file + anon + unevictable. + +6. Hierarchy support +==================== + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy:: + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim +------------------------------------------------ + +A memory cgroup by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: + + # echo 1 > memory.use_hierarchy + +The feature can be disabled by:: + + # echo 0 > memory.use_hierarchy + +NOTE1: + Enabling/disabling will fail if either the cgroup already has other + cgroups created below it, or if the parent cgroup has use_hierarchy + enabled. + +NOTE2: + When panic_on_oom is set to "2", the whole system will panic in + case of an OOM event in any cgroup. + +7. Soft limits +============== + +Soft limits allow for greater sharing of memory. The idea behind soft limits +is to allow control groups to use as much of the memory as needed, provided + +a. There is no memory contention +b. They do not exceed their hard limit + +When the system detects memory contention or low memory, control groups +are pushed back to their soft limits. If the soft limit of each control +group is very high, they are pushed back as much as possible to make +sure that one control group does not starve the others of memory. + +Please note that soft limits is a best-effort feature; it comes with +no guarantees, but it does its best to make sure that when memory is +heavily contended for, memory is allocated based on the soft limit +hints/setup. Currently soft limit based reclaim is set up such that +it gets invoked from balance_pgdat (kswapd). + +7.1 Interface +------------- + +Soft limits can be setup by using the following commands (in this example we +assume a soft limit of 256 MiB):: + + # echo 256M > memory.soft_limit_in_bytes + +If we want to change this to 1G, we can at any time use:: + + # echo 1G > memory.soft_limit_in_bytes + +NOTE1: + Soft limits take effect over a long period of time, since they involve + reclaiming memory for balancing between memory cgroups +NOTE2: + It is recommended to set the soft limit always below the hard limit, + otherwise the hard limit will take precedence. + +8. Move charges at task migration +================================= + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface +------------- + +This feature is disabled by default. It can be enabled (and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it:: + + # echo (some positive value) > memory.move_charge_at_immigrate + +Note: + Each bits of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. +Note: + Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. +Note: + If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: + It can take several seconds if you move charges much. + +And if you want disable it again:: + + # echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be moved +-------------------------------------- + +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved. But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup. + ++---+--------------------------------------------------------------------------+ +|bit| what type of charges would be moved ? | ++===+==========================================================================+ +| 0 | A charge of an anonymous page (or swap of it) used by the target task. | +| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | ++---+--------------------------------------------------------------------------+ +| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | +| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | +| | anonymous pages, file pages (and swaps) in the range mmapped by the task | +| | will be moved even if the task hasn't done page fault, i.e. they might | +| | not be the task's "RSS", but other task's "RSS" that maps the same file. | +| | And mapcount of the page is ignored (the page can be moved even if | +| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | +| | enable move of swap charges. | ++---+--------------------------------------------------------------------------+ + +8.3 TODO +-------- + +- All of moving charge operations are done under cgroup_mutex. It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds +==================== + +Memory cgroup implements memory thresholds using the cgroups notification +API (see cgroups.txt). It allows to register multiple memory and memsw +thresholds and gets notifications when it crosses. + +To register a threshold, an application must: + +- create an eventfd using eventfd(2); +- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; +- write string like " " to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction. + +It's applicable for root and non-root cgroup. + +10. OOM Control +=============== + +memory.oom_control file is for OOM notification and other controls. + +Memory cgroup implements OOM notifier using the cgroup notification +API (See cgroups.txt). It allows to register multiple OOM notification +delivery and gets notification when OOM happens. + +To register a notifier, an application must: + + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like " " to + cgroup.event_control + +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. + +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: + + #echo 1 > memory.oom_control + +If OOM-killer is disabled, tasks under cgroup will hang/sleep +in memory cgroup's OOM-waitqueue when they request accountable memory. + +For running them, you have to relax the memory cgroup's OOM status by + + * enlarge limit or reduce usage. + +To reduce usage, + + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. + + - oom_kill_disable 0 or 1 + (if 1, oom-killer is disabled) + - under_oom 0 or 1 + (if 1, the memory cgroup is under OOM, tasks may be stopped.) + +11. Memory Pressure +=================== + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as following: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shutdown unimportant services). + +The "medium" level means that the system is experiencing medium memory +pressure, the system might be making swap, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk. + +The "critical" level means that the system is actively thrashing, it is +about to out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +By default, events are propagated upward until the event is handled, i.e. the +events are not pass-through. For example, you have three cgroups: A->B->C. Now +you set up an event listener on cgroups A, B and C, and suppose group C +experiences some pressure. In this situation, only group C will receive the +notification, i.e. groups A and B will not receive it. This is done to avoid +excessive "broadcasting" of messages, which disturbs the system and which is +especially bad if we are low on memory or thrashing. Group B, will receive +notification only if there are no event listers for group C. + +There are three optional modes that specify different propagation behavior: + + - "default": this is the default behavior specified above. This mode is the + same as omitting the optional mode parameter, preserved by backwards + compatibility. + + - "hierarchy": events always propagate up to the root, similar to the default + behavior, except that propagation continues regardless of whether there are + event listeners at each level, with the "hierarchy" mode. In the above + example, groups A, B, and C will receive notification of memory pressure. + + - "local": events are pass-through, i.e. they only receive notifications when + memory pressure is experienced in the memcg for which the notification is + registered. In the above example, group C will receive notification if + registered for "local" notification and the group experiences memory + pressure. However, group B will never receive notification, regardless if + there is an event listener for group C or not, if group B is registered for + local notification. + +The level and event notification mode ("hierarchy" or "local", if necessary) are +specified by a comma-delimited string, i.e. "low,hierarchy" specifies +hierarchical, pass-through, notification for all ancestor memcgs. Notification +that is the default, non pass-through behavior, does not specify a mode. +"medium,local" specifies pass-through notification for the medium level. + +The file memory.pressure_level is only used to setup an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string as " " + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are no implemented. + +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure:: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low,hierarchy & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12. TODO +======== + +1. Make per-cgroup scanner reclaim not-shared pages first +2. Teach controller to account for shared-pages +3. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary +======= + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References +========== + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst new file mode 100644 index 000000000000..a2cf272af7a0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst @@ -0,0 +1,44 @@ +========================= +Network classifier cgroup +========================= + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. +Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. + +Example:: + + mkdir /sys/fs/cgroup/net_cls + mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls + mkdir /sys/fs/cgroup/net_cls/0 + echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + +- setting a 10:1 handle:: + + cat /sys/fs/cgroup/net_cls/0/net_cls.classid + 1048577 + +- configuring tc:: + + tc qdisc add dev eth0 root handle 10: htb + tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + +- creating traffic class 10:1:: + + tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example:: + + iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst new file mode 100644 index 000000000000..b40905871c64 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst @@ -0,0 +1,57 @@ +======================= +Network priority cgroup +======================= + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. This however, is not always possible because: + +1) The application may not have been coded to set this value +2) The priority of application traffic is often a site-specific administrative + decision rather than an application defined one. + +This cgroup allows an administrator to assign a process to a group which defines +the priority of egress traffic on a given interface. Network priority groups can +be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio + +With the above step, the initial group acting as the parent accounting group +becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in +the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. + +Each net_prio cgroup contains two files that are subsystem specific + +net_prio.prioidx + This file is read-only, and is simply informative. It contains a unique + integer value that the kernel uses as an internal representation of this + cgroup. + +net_prio.ifpriomap + This file contains a map of the priorities assigned to traffic originating + from processes in this group and egressing the system on various interfaces. + It contains a list of tuples in the form . Contents of this + file can be modified by echoing a string into the file using the same tuple + format. For example:: + + echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap + +This command would force any traffic originating from processes belonging to the +iscsi net_prio cgroup and egressing on interface eth0 to have the priority of +said traffic set to the value 5. The parent accounting group also has a +writeable 'net_prio.ifpriomap' file that can be used to set a system default +priority. + +Priorities are set immediately prior to queueing a frame to the device +queueing discipline (qdisc) so priorities will be assigned prior to the hardware +queue selection being made. + +One usage for the net_prio cgroup is with mqprio qdisc allowing application +traffic to be steered to hardware/driver based traffic classes. These mappings +can then be managed by administrators or other networking protocols such as +DCBX. + +A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst new file mode 100644 index 000000000000..6acebd9e72c8 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/pids.rst @@ -0,0 +1,92 @@ +========================= +Process Number Controller +========================= + +Abstract +-------- + +The process number controller is used to allow a cgroup hierarchy to stop any +new tasks from being fork()'d or clone()'d after a certain limit is reached. + +Since it is trivial to hit the task limit without hitting any kmemcg limits in +place, PIDs are a fundamental resource. As such, PID exhaustion must be +preventable in the scope of a cgroup hierarchy by allowing resource limiting of +the number of tasks in a cgroup. + +Usage +----- + +In order to use the `pids` controller, set the maximum number of tasks in +pids.max (this is not available in the root cgroup for obvious reasons). The +number of processes currently in the cgroup is given by pids.current. + +Organisational operations are not blocked by cgroup policies, so it is possible +to have pids.current > pids.max. This can be done by either setting the limit to +be smaller than pids.current, or attaching enough processes to the cgroup such +that pids.current > pids.max. However, it is not possible to violate a cgroup +policy through fork() or clone(). fork() and clone() will return -EAGAIN if the +creation of a new process would cause a cgroup policy to be violated. + +To set a cgroup to have no limit, set pids.max to "max". This is the default for +all new cgroups (N.B. that PID limits are hierarchical, so the most stringent +limit in the hierarchy is followed). + +pids.current tracks all child cgroup hierarchies, so parent/pids.current is a +superset of parent/child/pids.current. + +The pids.events file contains event counters: + + - max: Number of times fork failed because limit was hit. + +Example +------- + +First, we mount the pids controller:: + + # mkdir -p /sys/fs/cgroup/pids + # mount -t cgroup -o pids none /sys/fs/cgroup/pids + +Then we create a hierarchy, set limits and attach processes to it:: + + # mkdir -p /sys/fs/cgroup/pids/parent/child + # echo 2 > /sys/fs/cgroup/pids/parent/pids.max + # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # + +It should be noted that attempts to overcome the set limit (2 in this case) will +fail:: + + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +Even if we migrate to a child cgroup (which doesn't have a set limit), we will +not be able to overcome the most stringent limit in the hierarchy (in this case, +parent's):: + + # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.max + max + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +We can set a limit that is smaller than pids.current, which will stop any new +processes from being forked at all (note that the shell itself counts towards +pids.current):: + + # echo 1 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # echo 0 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst new file mode 100644 index 000000000000..2fcb0a9bf790 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/rdma.rst @@ -0,0 +1,117 @@ +=============== +RDMA Controller +=============== + +.. Contents + + 1. Overview + 1-1. What is RDMA controller? + 1-2. Why RDMA controller needed? + 1-3. How is RDMA controller implemented? + 2. Usage Examples + +1. Overview +=========== + +1-1. What is RDMA controller? +----------------------------- + +RDMA controller allows user to limit RDMA/IB specific resources that a given +set of processes can use. These processes are grouped using RDMA controller. + +RDMA controller defines two resources which can be limited for processes of a +cgroup. + +1-2. Why RDMA controller needed? +-------------------------------- + +Currently user space applications can easily take away all the rdma verb +specific resources such as AH, CQ, QP, MR etc. Due to which other applications +in other cgroup or kernel space ULPs may not even get chance to allocate any +rdma resources. This can lead to service unavailability. + +Therefore RDMA controller is needed through which resource consumption +of processes can be limited. Through this controller different rdma +resources can be accounted. + +1-3. How is RDMA controller implemented? +---------------------------------------- + +RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains +resource accounting per cgroup, per device using resource pool structure. +Each such resource pool is limited up to 64 resources in given resource pool +by rdma cgroup, which can be extended later if required. + +This resource pool object is linked to the cgroup css. Typically there +are 0 to 4 resource pool instances per cgroup, per device in most use cases. +But nothing limits to have it more. At present hundreds of RDMA devices per +single cgroup may not be handled optimally, however there is no +known use case or requirement for such configuration either. + +Since RDMA resources can be allocated from any process and can be freed by any +of the child processes which shares the address space, rdma resources are +always owned by the creator cgroup css. This allows process migration from one +to other cgroup without major complexity of transferring resource ownership; +because such ownership is not really present due to shared nature of +rdma resources. Linking resources around css also ensures that cgroups can be +deleted after processes migrated. This allow progress migration as well with +active resources, even though that is not a primary use case. + +Whenever RDMA resource charging occurs, owner rdma cgroup is returned to +the caller. Same rdma cgroup should be passed while uncharging the resource. +This also allows process migrated with active RDMA resource to charge +to new owner cgroup for new resource. It also allows to uncharge resource of +a process from previously charged cgroup which is migrated to new cgroup, +even though that is not a primary use case. + +Resource pool object is created in following situations. +(a) User sets the limit and no previous resource pool exist for the device +of interest for the cgroup. +(b) No resource limits were configured, but IB/RDMA stack tries to +charge the resource. So that it correctly uncharge them when applications are +running without limits and later on when limits are enforced during uncharging, +otherwise usage count will drop to negative. + +Resource pool is destroyed if all the resource limits are set to max and +it is the last resource getting deallocated. + +User should set all the limit to max value if it intents to remove/unconfigure +the resource pool for a particular device. + +IB stack honors limits enforced by the rdma controller. When application +query about maximum resource limits of IB device, it returns minimum of +what is configured by user for a given cgroup and what is supported by +IB device. + +Following resources can be accounted by rdma controller. + + ========== ============================= + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + ========== ============================= + +2. Usage Examples +================= + +(a) Configure resource limit:: + + echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max + echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max + +(b) Query resource limit:: + + cat /sys/fs/cgroup/rdma/2/rdma.max + #Output: + mlx4_0 hca_handle=2 hca_object=2000 + ocrdma1 hca_handle=3 hca_object=max + +(c) Query current usage:: + + cat /sys/fs/cgroup/rdma/2/rdma.current + #Output: + mlx4_0 hca_handle=1 hca_object=20 + ocrdma1 hca_handle=1 hca_object=23 + +(d) Delete resource limit:: + + echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 080b18ce2a5d..ed4c5977d6e1 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and conventions of cgroup v2. It describes all userland-visible aspects of cgroup including core and specific controller behaviors. All future changes must be reflected in this document. Documentation for -v1 is available under Documentation/cgroup-v1/. +v1 is available under Documentation/admin-guide/cgroup-v1/. .. CONTENTS diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 1f0d9b939311..a5fdb1a846ce 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking. initrd cgroup-v2 + cgroup-v1/index serial-console braille-console parport diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 78576aa45cce..a571a67e0c85 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4089,7 +4089,7 @@ relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cgroup-v1/cpusets.rst. + See Documentation/admin-guide/cgroup-v1/cpusets.rst. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: ,[,,,...] @@ -4599,7 +4599,7 @@ swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/cgroup-v1/memory.rst) + it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { | force | noforce } diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 546f174e5d6a..8463f5538fda 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy support. Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.rst``) +(``Documentation/admin-guide/cgroup-v1/cpusets.rst``) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst index 2c13b2fc1888..0d237d402860 100644 --- a/Documentation/block/bfq-iosched.rst +++ b/Documentation/block/bfq-iosched.rst @@ -547,7 +547,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all the stat files documented in -Documentation/cgroup-v1/blkio-controller.rst. If, instead, +Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead, CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files:: blkio.bfq.io_service_bytes diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst deleted file mode 100644 index 1d7d962933be..000000000000 --- a/Documentation/cgroup-v1/blkio-controller.rst +++ /dev/null @@ -1,302 +0,0 @@ -=================== -Block IO Controller -=================== - -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need of various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -One IO control policy is throttling policy which can be used to -specify upper IO rate limits on devices. This policy is implemented in -generic block layer and can be used on leaf nodes as well as higher -level logical devices like device mapper. - -HOWTO -===== -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller:: - - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer:: - - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: - - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is ": ":: - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. - -- Run dd to read a file and see if rate is throttled to 1MB/s or not:: - - # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Throttling implements hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows:: - - root - / \ - test1 test2 - | - test3 - -Throttling with "sane_behavior" will handle the -hierarchy correctly. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following:: - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options -=================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_BFQ_CGROUP_DEBUG - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files -======================= -Proportional weight policy files --------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. - -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. - - Following is the format:: - - # echo dev_maj:dev_minor weight > blkio.weight_device - - Configure weight=300 on /dev/sdb (8:16) in this cgroup:: - - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup:: - - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup:: - - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup has while - competing with the cgroup's child cgroups. For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. - -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. - -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This - gives the statistics about how many a times a group was dequeued - from service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in io per second. Rules are per device. Following is - the format:: - - echo ": " > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ------------------------------------ -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst deleted file mode 100644 index 46bbe7e022d4..000000000000 --- a/Documentation/cgroup-v1/cgroups.rst +++ /dev/null @@ -1,695 +0,0 @@ -============== -Control Groups -============== - -Written by Paul Menage based on -Documentation/cgroup-v1/cpusets.rst - -Original copyright statements from cpusets.txt: - -Portions Copyright (C) 2004 BULL SA. - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. - -Modified by Paul Jackson - -Modified by Christoph Lameter - -.. CONTENTS: - - 1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name - 3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API - 4. Extended attributes usage - 5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? ----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. - -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? ----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. - -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. - -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. The resource planning for this server could be along the -following lines:: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). - -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can:: - - # echo browser_pid > /sys/fs/cgroup///tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With ability to write PIDs directly to resource classes, it's just a -matter of:: - - # echo pid > /sys/fs/cgroup/network//tasks - (after some time) - # echo pid > /sys/fs/cgroup/network//tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ? ---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common. A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. - -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. - -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. - -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? --------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like:: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup:: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type:: - - # mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?` you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group:: - - # mount -t tmpfs cgroup_root /sys/fs/cgroup - # mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type:: - - # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommend -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To Specify a hierarchy's release_agent:: - - # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent:: - - # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1:: - - # cd /sys/fs/cgroup/rg1 - # mkdir my_cgroup - -Now you want to do something with this cgroup: - - # cd my_cgroup - -In this directory you can find several files:: - - # ls - cgroup.procs notify_on_release tasks - (plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup:: - - # /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cgroup, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0:: - - # echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. - -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name= option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. - -The name should match [\w.-]+ - -When passing a name= option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc//cgroups. - - -3. Kernel API -============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot. - -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock(). - -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called _cgrp_subsys - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or a -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code. - -``int css_online(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy. See the comment on -cgroup_for_each_descendant_pre() for details. - -``void css_offline(struct cgroup *cgrp);`` -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. @cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -``void css_free(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. - -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. - -``void css_reset(struct cgroup_subsys_state *css)`` -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, not necessary. -This will be called only about subsystems whose can_attach() operation have -succeeded. The parameters are identical to can_attach(). - -``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. -The parameters are identical to can_attach(). - -``void fork(struct task_struct *task)`` - -Called when a task is forked into a cgroup. - -``void exit(struct task_struct *task)`` - -Called during task exit. - -``void free(struct task_struct *task)`` - -Called when the task_struct is freed. - -``void bind(struct cgroup *root)`` -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). - -4. Extended attribute usage -=========================== - -cgroup filesystem supports certain types of extended attributes in its -directories and files. The current supported types are: - - - Trusted (XATTR_TRUSTED) - - Security (XATTR_SECURITY) - -Both require CAP_SYS_ADMIN capability to set. - -Like in tmpfs, the extended attributes in cgroup filesystem are stored -using kernel memory and it's advised to keep the usage at minimum. This -is the reason why user defined extended attributes are not supported, since -any user can do it and there's no limit in the value size. - -The current known users for this feature are SELinux to limit cgroup usage -in containers and systemd for assorted meta data like main PID in a cgroup -(systemd creates a cgroup per service). - -5. Questions -============ - -:: - - Q: what's up with this '/bin/echo' ? - A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cgroup file system, you won't be - able to tell whether a command succeeded or failed. - - Q: When I attach processes, only the first of the line gets really attached ! - A: We can only return one error code per call to write(). So you should also - put only ONE PID. diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst deleted file mode 100644 index d30ed81d2ad7..000000000000 --- a/Documentation/cgroup-v1/cpuacct.rst +++ /dev/null @@ -1,50 +0,0 @@ -========================= -CPU Accounting Controller -========================= - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -ocpuacct none /sys/fs/cgroup - -With the above step, the initial or the parent accounting group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. -/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained -by this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/sys/fs/cgroup/cpuacct.usage also. - -cpuacct.stat file lists a few statistics which further divide the -CPU time obtained by the cgroup into user and system times. Currently -the following statistics are supported: - -user: Time spent by tasks of the cgroup in user mode. -system: Time spent by tasks of the cgroup in kernel mode. - -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst deleted file mode 100644 index b6a42cdea72b..000000000000 --- a/Documentation/cgroup-v1/cpusets.rst +++ /dev/null @@ -1,866 +0,0 @@ -======= -CPUSETS -======= - -Copyright (C) 2004 BULL SA. - -Written by Simon.Derr@bull.net - -- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -- Modified by Paul Jackson -- Modified by Christoph Lameter -- Modified by Paul Menage -- Modified by Hidetoshi Seto - -.. CONTENTS: - - 1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes - 3. Questions - 4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroup-v1/cgroups.rst. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that task's cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that task's cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendants) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that task's cpuset. - - in sched.c migrate_live_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that task's cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the task's cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example:: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpuset.cpus: list of CPUs in that cpuset - - cpuset.mems: list of Memory Nodes in that cpuset - - cpuset.memory_migrate flag: if set, move pages to cpusets nodes - - cpuset.cpu_exclusive flag: is cpu placement exclusive? - - cpuset.mem_exclusive flag: is memory placement exclusive? - - cpuset.mem_hardwall flag: is memory allocation hardwalled - - cpuset.memory_pressure: measure of how much paging pressure in cpuset - - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes - - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - - cpuset.sched_relax_domain_level: the searching range when migrating tasks - -In addition, only the root cpuset has the following file: - - - cpuset.memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_mask using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendant, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned to them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> - Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'cpuset.memory_spread_page' and -'cpuset.memory_spread_slab'. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the task's NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the task's NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing task's memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PFA_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current task's mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched/core.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, including those -marked isolated using the kernel boot time "isolcpus=" argument. However, -the isolated CPUs will not participate in load balancing, and will not -have tasks running on them unless explicitly assigned. - -This default load balancing across all CPUs is not well suited for -the following two situations: - - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"cpuset.sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendant cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. - -The cpuset code builds a new such partition and passes it to the -scheduler sched domain setup code, to have the sched domains rebuilt -as necessary, whenever: - - - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, - - or CPUs come or go from a cpuset with this flag enabled, - - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs - and with this flag enabled changes, - - or a cpuset with non-empty CPUs and with this flag enabled is removed, - - or a cpu is offlined/onlined. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (struct cpumask) in the -partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -every time. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searches all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'cpuset.sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - -====== =========================================================== - -1 no request. use system default or follow request of others. - 0 no search. - 1 search siblings (hyperthreads in a core). - 2 search cores in a package. - 3 search cpus in a node [= system wide on non-NUMA system] - 4 search nodes in a chunk of node [on NUMA system] - 5 search system wide [on NUMA system] -====== =========================================================== - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset -is disabled, then 'cpuset.sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not depends on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. - then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the task's cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its NUMA placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the task's -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a task's pid is written to another cpuset's 'tasks' file, then its -allowed CPU placement is changed immediately. If such a task had been -bound to some subset of its cpuset using the sched_setaffinity() call, -the task will be allowed to run on any CPU allowed in its new cpuset, -negating the effect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -and the processor placement is updated immediately. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'cpuset.mems' subsequently changes. -If the cpuset flag file 'cpuset.memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the task's new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'cpuset.memory_migrate' is set true, then if that cpuset's -'cpuset.mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'cpuset.mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the task's prior cpuset, or in the cpuset's -prior 'cpuset.mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current task's cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /sys/fs/cgroup/cpuset - 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /sys/fs/cgroup/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset:: - - mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -There are ways to query or modify cpusets: - - - via the cpuset file system directly, using the various cd, mkdir, echo, - cat, rmdir commands from the shell, or their equivalent from C. - - via the C library libcpuset. - - via the C library libcgroup. - (http://sourceforge.net/projects/libcg/) - - via the python application cset. - (http://code.google.com/p/cpuset/) - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: - - # cd /sys/fs/cgroup/cpuset - # mkdir my_cpuset - -Now you want to do something with this cpuset:: - - # cd my_cpuset - -In this directory you can find several files:: - - # ls - cgroup.clone_children cpuset.memory_pressure - cgroup.event_control cpuset.memory_spread_page - cgroup.procs cpuset.memory_spread_slab - cpuset.cpu_exclusive cpuset.mems - cpuset.cpus cpuset.sched_load_balance - cpuset.mem_exclusive cpuset.sched_relax_domain_level - cpuset.mem_hardwall notify_on_release - cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags:: - - # /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus:: - - # /bin/echo 0-7 > cpuset.cpus - -Add some mems:: - - # /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset:: - - # /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cpuset, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command:: - - mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to:: - - mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset - echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories:: - - # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset:: - - # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. - -To remove all the CPUs:: - - # /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple:: - - # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' - # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: - what's up with this '/bin/echo' ? - -A: - bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: - When I attach processes, only the first of the line gets really attached ! - -A: - We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst deleted file mode 100644 index e1886783961e..000000000000 --- a/Documentation/cgroup-v1/devices.rst +++ /dev/null @@ -1,132 +0,0 @@ -=========================== -Device Whitelist Controller -=========================== - -1. Description -============== - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface -================= - -An entry is added using devices.allow, and removed using -devices.deny. For instance:: - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing:: - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing:: - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security -=========== - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy -============ - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. - -Example:: - - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A:: - - # echo "c 116:* r" > A/devices.deny - -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed:: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. - -Notice that new whitelist entries will not be propagated:: - - A - / \ - B - - group whitelist entries denied devices - A "c 1:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -when adding ``c *:3 rwm``:: - - # echo "c *:3 rwm" >A/devices.allow - -the result:: - - group whitelist entries denied devices - A "c *:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -but now it'll be possible to add new entries to B:: - - # echo "c 2:3 rwm" >B/devices.allow - # echo "c 50:3 r" >B/devices.allow - -or even:: - - # echo "c *:3 rwm" >B/devices.allow - -Allowing or denying all by writing 'a' to devices.allow or devices.deny will -not be possible once the device cgroups has children. - -4.1 Hierarchy (internal implementation) ---------------------------------------- - -device cgroups is implemented internally using a behavior (ALLOW, DENY) and a -list of exceptions. The internal state is controlled using the same user -interface to preserve compatibility with the previous whitelist-only -implementation. Removal or addition of exceptions that will reduce the access -to devices will be propagated down the hierarchy. -For every propagated exception, the effective rules will be re-evaluated based -on current parent's access rules. diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst deleted file mode 100644 index 582d3427de3f..000000000000 --- a/Documentation/cgroup-v1/freezer-subsystem.rst +++ /dev/null @@ -1,127 +0,0 @@ -============== -Cgroup Freezer -============== - -The cgroup freezer is useful to batch job management system which start -and stop sets of tasks in order to schedule the resources of a machine -according to the desires of a system administrator. This sort of program -is often used on HPC clusters to schedule access to the cluster as a -whole. The cgroup freezer uses cgroups to describe the set of tasks to -be started/stopped by the batch job management system. It also provides -a means to start and stop the tasks composing the job. - -The cgroup freezer will also be useful for checkpointing running groups -of tasks. The freezer allows the checkpoint code to obtain a consistent -image of the tasks by attempting to force the tasks in a cgroup into a -quiescent state. Once the tasks are quiescent another task can -walk /proc or invoke a kernel interface to gather information about the -quiesced tasks. Checkpointed tasks can be restarted later should a -recoverable error occur. This also allows the checkpointed tasks to be -migrated between nodes in a cluster by copying the gathered information -to another node and restarting the tasks there. - -Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping -and resuming tasks in userspace. Both of these signals are observable -from within the tasks we wish to freeze. While SIGSTOP cannot be caught, -blocked, or ignored it can be seen by waiting or ptracing parent tasks. -SIGCONT is especially unsuitable since it can be caught by the task. Any -programs designed to watch for SIGSTOP and SIGCONT could be broken by -attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can -demonstrate this problem using nested bash shells:: - - $ echo $$ - 16644 - $ bash - $ echo $$ - 16690 - - From a second, unrelated bash shell: - $ kill -SIGSTOP 16690 - $ kill -SIGCONT 16690 - - - -This happens because bash can observe both signals and choose how it -responds to them. - -Another example of a program which catches and responds to these -signals is gdb. In fact any program designed to use ptrace is likely to -have a problem with this method of stopping and resuming tasks. - -In contrast, the cgroup freezer uses the kernel freezer code to -prevent the freeze/unfreeze cycle from becoming visible to the tasks -being frozen. This allows the bash example above and gdb to run as -expected. - -The cgroup freezer is hierarchical. Freezing a cgroup freezes all -tasks belonging to the cgroup and all its descendant cgroups. Each -cgroup has its own state (self-state) and the state inherited from the -parent (parent-state). Iff both states are THAWED, the cgroup is -THAWED. - -The following cgroupfs files are created by cgroup freezer. - -* freezer.state: Read-write. - - When read, returns the effective state of the cgroup - "THAWED", - "FREEZING" or "FROZEN". This is the combined self and parent-states. - If any is freezing, the cgroup is freezing (FREEZING or FROZEN). - - FREEZING cgroup transitions into FROZEN state when all tasks - belonging to the cgroup and its descendants become frozen. Note that - a cgroup reverts to FREEZING from FROZEN after a new task is added - to the cgroup or one of its descendant cgroups until the new task is - frozen. - - When written, sets the self-state of the cgroup. Two values are - allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, - if not already freezing, enters FREEZING state along with all its - descendant cgroups. - - If THAWED is written, the self-state of the cgroup is changed to - THAWED. Note that the effective state may not change to THAWED if - the parent-state is still freezing. If a cgroup's effective state - becomes THAWED, all its descendants which are freezing because of - the cgroup also leave the freezing state. - -* freezer.self_freezing: Read only. - - Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. - This value is 1 iff the last write to freezer.state was "FROZEN". - -* freezer.parent_freezing: Read only. - - Shows the parent-state. 0 if none of the cgroup's ancestors is - frozen; otherwise, 1. - -The root cgroup is non-freezable and the above interface files don't -exist. - -* Examples of usage:: - - # mkdir /sys/fs/cgroup/freezer - # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer - # mkdir /sys/fs/cgroup/freezer/0 - # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks - -to get status of the freezer subsystem:: - - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -to freeze all tasks in the container:: - - # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - FREEZING - # cat /sys/fs/cgroup/freezer/0/freezer.state - FROZEN - -to unfreeze all tasks in the container:: - - # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -This is the basic mechanism which should do the right thing for user space task -in a simple scenario. diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst deleted file mode 100644 index a3902aa253a9..000000000000 --- a/Documentation/cgroup-v1/hugetlb.rst +++ /dev/null @@ -1,50 +0,0 @@ -================== -HugeTLB Controller -================== - -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault. Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. - -HugeTLB controller can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -o hugetlb none /sys/fs/cgroup - -With the above step, the initial or the parent HugeTLB group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. - -New groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. - -Brief summary of control files:: - - hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit - -For a system supporting three hugepage sizes (64k, 32M and 1G), the control -files include:: - - hugetlb.1GB.limit_in_bytes - hugetlb.1GB.max_usage_in_bytes - hugetlb.1GB.usage_in_bytes - hugetlb.1GB.failcnt - hugetlb.64KB.limit_in_bytes - hugetlb.64KB.max_usage_in_bytes - hugetlb.64KB.usage_in_bytes - hugetlb.64KB.failcnt - hugetlb.32MB.limit_in_bytes - hugetlb.32MB.max_usage_in_bytes - hugetlb.32MB.usage_in_bytes - hugetlb.32MB.failcnt diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst deleted file mode 100644 index fe76d42edc11..000000000000 --- a/Documentation/cgroup-v1/index.rst +++ /dev/null @@ -1,30 +0,0 @@ -:orphan: - -======================== -Control Groups version 1 -======================== - -.. toctree:: - :maxdepth: 1 - - cgroups - - blkio-controller - cpuacct - cpusets - devices - freezer-subsystem - hugetlb - memcg_test - memory - net_cls - net_prio - pids - rdma - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst deleted file mode 100644 index 91bd18c6a514..000000000000 --- a/Documentation/cgroup-v1/memcg_test.rst +++ /dev/null @@ -1,355 +0,0 @@ -===================================================== -Memory Resource Controller(Memcg) Implementation Memo -===================================================== - -Last Updated: 2010/2 - -Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/cgroup-v1/memory.rst) - -0. How to record usage ? -======================== - - 2 objects are used. - - page_cgroup ....an object per page. - - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge -========= - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_try_charge() - -2. Uncharge -=========== - - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge() - Called when a page's refcount goes down to 0. - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - -3. charge-commit-cancel -======================= - - Memcg pages are charged in two steps: - - - mem_cgroup_try_charge() - - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the page is associated with the memcg. - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous -============ - - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - -5. Page Cache -============= - - Page Cache is charged at - - add_to_page_cache_locked(). - - The logic is very clear. (About migration, see below) - - Note: - __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache -=========================== - - The best way to understand shmem's page state transition is to read - mm/shmem.c. - - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - -7. Page Migration -================= - - mem_cgroup_migrate() - -8. LRU -====== - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global pgdat->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under pgdat->lru_lock. - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. -================= - - Tests for racy cases. - -9.1 Small limit to memcg. -------------------------- - - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - -9.2 Shmem ---------- - - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - -9.3 Migration -------------- - - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration:: - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset.:: - - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - -9.4 Memory hotplug ------------------- - - memory hotplug test is one of good test. - - to offline memory, do following:: - - # echo offline > /sys/devices/system/memory/memoryXXX/state - - (XXX is the place of memory) - - This is an easy way to test page migration, too. - -9.5 mkdir/rmdir ---------------- - - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following:: - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running:: - - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - -9.6 Mount with other subsystems -------------------------------- - - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example:: - - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. - -9.7 swapoff ------------ - - Besides management of swap is one of complicated parts of memcg, - call path of swap-in at swapoff is not same as usual swap-in path.. - It's worth to be tested explicitly. - - For example, test like following is good: - - (Shell-A):: - - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - - Run malloc(100M) program under this. You'll see 60M of swaps. - - (Shell-B):: - - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, tmpfs v.s. swapoff test should be tested, too. - -9.8 OOM-Killer --------------- - - Out-of-memory caused by memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under hierarchy - will be killed by the kernel. - - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. - - It's not difficult to cause OOM under memcg as following. - - Case A) when you can swapoff:: - - #swapoff -a - #echo 50M > /memory.limit_in_bytes - - run 51M of malloc - - Case B) when you use mem+swap limitation:: - - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - - run 51M of malloc - -9.9 Move charges at task migration ----------------------------------- - - Charges associated with a task can be moved along with task migration. - - (Shell-A):: - - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - - run some programs which uses some amount of memory in /cgroup/A. - - (Shell-B):: - - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see charges have been moved by reading ``*.usage_in_bytes`` or - memory.stat of both A and B. - - See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should - be written to move_charge_at_immigrate. - -9.10 Memory thresholds ----------------------- - - Memory controller implements memory thresholds using cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. - - (Shell-A) Create cgroup and run event listener:: - - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add task to cgroup and try to allocate and free memory:: - - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see message from cgroup_event_listener every time you cross - the thresholds. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's good idea to test root cgroup as well. diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst deleted file mode 100644 index 41bdc038dad9..000000000000 --- a/Documentation/cgroup-v1/memory.rst +++ /dev/null @@ -1,1003 +0,0 @@ -========================== -Memory Resource Controller -========================== - -NOTE: - This document is hopelessly outdated and it asks for a complete - rewrite. It still contains a useful information so we are keeping it - here but make sure to check the current code if you need a deeper - understanding. - -NOTE: - The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse memory controller - used here with the memory controller that is used in hardware. - -(For editors) In this document: - When we mention a cgroup (cgroupfs's directory) with memory controller, - we call it "memory cgroup". When you see git-log and source code, you'll - see patch's title and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller -============================================= - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm(development version of 2010/April) - -Features: - - - accounting anonymous pages, file caches, swap caches usage and limiting them. - - pages are linked to per-memcg LRU exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) account at moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. - - Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) - -Brief summary of control files. - -==================================== ========================================== - tasks attach a task(thread) and show list of - threads - cgroup.procs show list of processes - cgroup.event_control an interface for event_fd() - memory.usage_in_bytes show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes set/show limit of memory usage - memory.memsw.limit_in_bytes set/show limit of memory+Swap usage - memory.failcnt show the number of memory usage hits limits - memory.memsw.failcnt show the number of memory+Swap hits limits - memory.max_usage_in_bytes show max memory usage recorded - memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded - memory.soft_limit_in_bytes set/show soft limit of memory usage - memory.stat show various statistics - memory.use_hierarchy set/show hierarchical account enabled - memory.force_empty trigger forced page reclaim - memory.pressure_level set memory pressure notifications - memory.swappiness set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate set/show controls of moving charges - memory.oom_control set/show oom controls. - memory.numa_stat show the number of memory usage per numa - node - - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - memory.kmem.usage_in_bytes show current kernel memory allocation - memory.kmem.failcnt show the number of kernel memory usage - hits limits - memory.kmem.max_usage_in_bytes show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation - memory.kmem.tcp.failcnt show the number of tcp buf memory usage - hits limits - memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded -==================================== ========================================== - -1. History -========== - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control -================= - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design ------------ - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting ---------------- - -:: - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. - -2.2.1 Accounting details ------------------------- - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting --------------------------- - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) --------------------------------------- - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -**why 'memory+swap' rather than swap** - -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -**What happens when a cgroup hits memory.memsw.limit_in_bytes** - -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. - -2.5 Reclaim ------------ - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: - Reclaim does not work for the root cgroup, since we cannot set any - limits on the root cgroup. - -Note2: - When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) - -2.6 Locking ------------ - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - the i_pages lock. - - Other lock order is following: - - PG_locked. - mm->page_table_lock - pgdat->lru_lock - lock_page_cgroup. - - In many cases, just lock_page_cgroup() is called. - - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - pgdat->lru_lock, it has no lock of its own. - -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) ------------------------------------------------ - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). - -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. - -2.7.1 Current Kernel Memory resources accounted ------------------------------------------------ - -stack pages: - every process consumes some stack pages. By accounting into - kernel memory, we prevent new processes from being created when the kernel - memory usage is too high. - -slab pages: - pages allocated by the SLAB or SLUB allocator are tracked. A copy - of each kmem_cache is created every time the cache is touched by the first time - from inside the memcg. The creation is done lazily, so some objects can still be - skipped while the cache is being created. All objects in a slab page should - belong to the same memcg. This only fails to hold when a task is migrated to a - different memcg during the page allocation by the cache. - -sockets memory pressure: - some sockets protocols have memory pressure - thresholds. The Memory Controller allows them to be controlled individually - per cgroup, instead of globally. - -tcp memory pressure: - sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases ----------------------- - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - -U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - -U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommited. - Overcommiting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. - -WARNING: - In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - -U != 0, K >= U: - Since kmem charges will also be fed to the user counter and reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface -================= - -3.0. Configuration ------------------- - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -------------------------------------------------------------------- - -:: - - # mount -t tmpfs none /sys/fs/cgroup - # mkdir /sys/fs/cgroup/memory - # mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it:: - - # mkdir /sys/fs/cgroup/memory/0 - # echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit:: - - # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: - We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, - mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, - Gibibytes.) - -NOTE: - We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. - -NOTE: - We cannot set limits on the root cgroup any more. - -:: - - # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes - 4194304 - -We can check the usage:: - - # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes - 1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel:: - - # echo 1 > memory.limit_in_bytes - # cat memory.limit_in_bytes - 4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing -========== - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting -------------------- - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration ------------------- - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. "Move charges at task migration" - -4.3 Removing a cgroup ---------------------- - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces -=================== - -5.1 force_empty ---------------- - memory.force_empty interface is provided to make cgroup's memory usage empty. - When writing anything to this:: - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Though rmdir() offlines memcg, but the memcg may still stay there due to - charged file caches. Some out-of-use page caches may keep charged until - memory pressure happens. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file -------------- - -memory.stat file includes following statistics - -per-memory cgroup local status -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -=============== =============================================================== -cache # of bytes of page cache memory. -rss # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge # of bytes of anonymous transparent hugepages. -mapped_file # of bytes of mapped file (includes tmpfs/shmem) -pgpgin # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap # of bytes of swap usage -dirty # of bytes that are waiting to get written back to the disk. -writeback # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file # of bytes of file-backed memory on inactive LRU list. -active_file # of bytes of file-backed memory on active LRU list. -unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). -=============== =============================================================== - -status considering hierarchy (see memory.use_hierarchy settings) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= =================================================== -hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_ # hierarchical version of , which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - , i.e. total_cache -========================= =================================================== - -The following additional stats are dependent on CONFIG_DEBUG_VM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= ======================================== -recent_rotated_anon VM internal parameter. (see mm/vmscan.c) -recent_rotated_file VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon VM internal parameter. (see mm/vmscan.c) -recent_scanned_file VM internal parameter. (see mm/vmscan.c) -========================= ======================================== - -Memo: - recent_rotated means recent frequency of LRU rotation. - recent_scanned means recent # of scans to LRU. - showing for better debug please see the code for meanings. - -Note: - Only anonymous and swap cache memory is listed as part of 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - - 'rss + mapped_file" will give you resident set size of cgroup. - - (Note: file and shmem may be shared among other cgroups. In that case, - mapped_file is accounted only when the memory cgroup is owner of page - cache.) - -5.3 swappiness --------------- - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that unlike during the global reclaim, limit reclaim -enforces that 0 swappiness really prevents from any swapping even if -there is a swap storage available. This might lead to memcg OOM killer -if there are no file pages to reclaim. - -5.4 failcnt ------------ - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. -This failcnt(== failure count) shows the number of times that a usage counter -hit its limit. When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. - -You can reset failcnt by writing 0 to failcnt file:: - - # echo 0 > .../memory.failcnt - -5.5 usage_in_bytes ------------------- - -For efficiency, as other kernel components, memory cgroup uses some optimization -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) -value in memory.stat(see 5.2). - -5.6 numa_stat -------------- - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -an memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation. - -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts including "hierarchical_" which sums up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is:: - - total= N0= N1= ... - file= N0= N1= ... - anon= N0= N1= ... - unevictable= N0= N1= ... - hierarchical_= N0= N1= ... - -The "total" count is sum of file + anon + unevictable. - -6. Hierarchy support -==================== - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy:: - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim ------------------------------------------------- - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: - - # echo 1 > memory.use_hierarchy - -The feature can be disabled by:: - - # echo 0 > memory.use_hierarchy - -NOTE1: - Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: - When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits -============== - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits is a best-effort feature; it comes with -no guarantees, but it does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). - -7.1 Interface -------------- - -Soft limits can be setup by using the following commands (in this example we -assume a soft limit of 256 MiB):: - - # echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use:: - - # echo 1G > memory.soft_limit_in_bytes - -NOTE1: - Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: - It is recommended to set the soft limit always below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration -================================= - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface -------------- - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it:: - - # echo (some positive value) > memory.move_charge_at_immigrate - -Note: - Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: - Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. -Note: - If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: - It can take several seconds if you move charges much. - -And if you want disable it again:: - - # echo 0 > memory.move_charge_at_immigrate - -8.2 Type of charges which can be moved --------------------------------------- - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. - -+---+--------------------------------------------------------------------------+ -|bit| what type of charges would be moved ? | -+===+==========================================================================+ -| 0 | A charge of an anonymous page (or swap of it) used by the target task. | -| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | -+---+--------------------------------------------------------------------------+ -| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | -| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | -| | anonymous pages, file pages (and swaps) in the range mmapped by the task | -| | will be moved even if the task hasn't done page fault, i.e. they might | -| | not be the task's "RSS", but other task's "RSS" that maps the same file. | -| | And mapcount of the page is ignored (the page can be moved even if | -| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | -| | enable move of swap charges. | -+---+--------------------------------------------------------------------------+ - -8.3 TODO --------- - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds -==================== - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows to register multiple memory and memsw -thresholds and gets notifications when it crosses. - -To register a threshold, an application must: - -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write string like " " to - cgroup.event_control. - -Application will be notified through eventfd when memory usage crosses -threshold in any direction. - -It's applicable for root and non-root cgroup. - -10. OOM Control -=============== - -memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements OOM notifier using the cgroup notification -API (See cgroups.txt). It allows to register multiple OOM notification -delivery and gets notification when OOM happens. - -To register a notifier, an application must: - - - create an eventfd using eventfd(2) - - open memory.oom_control file - - write string like " " to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If OOM-killer is disabled, tasks under cgroup will hang/sleep -in memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - - * enlarge limit or reduce usage. - -To reduce usage, - - * kill some tasks. - * move some tasks to other group with account migration. - * remove some files (on tmpfs?) - -Then, stopped tasks will work again. - -At reading, current status of OOM is shown. - - - oom_kill_disable 0 or 1 - (if 1, oom-killer is disabled) - - under_oom 0 or 1 - (if 1, the memory cgroup is under OOM, tasks may be stopped.) - -11. Memory Pressure -=================== - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as following: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shutdown unimportant services). - -The "medium" level means that the system is experiencing medium memory -pressure, the system might be making swap, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk. - -The "critical" level means that the system is actively thrashing, it is -about to out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take an immediate action. - -By default, events are propagated upward until the event is handled, i.e. the -events are not pass-through. For example, you have three cgroups: A->B->C. Now -you set up an event listener on cgroups A, B and C, and suppose group C -experiences some pressure. In this situation, only group C will receive the -notification, i.e. groups A and B will not receive it. This is done to avoid -excessive "broadcasting" of messages, which disturbs the system and which is -especially bad if we are low on memory or thrashing. Group B, will receive -notification only if there are no event listers for group C. - -There are three optional modes that specify different propagation behavior: - - - "default": this is the default behavior specified above. This mode is the - same as omitting the optional mode parameter, preserved by backwards - compatibility. - - - "hierarchy": events always propagate up to the root, similar to the default - behavior, except that propagation continues regardless of whether there are - event listeners at each level, with the "hierarchy" mode. In the above - example, groups A, B, and C will receive notification of memory pressure. - - - "local": events are pass-through, i.e. they only receive notifications when - memory pressure is experienced in the memcg for which the notification is - registered. In the above example, group C will receive notification if - registered for "local" notification and the group experiences memory - pressure. However, group B will never receive notification, regardless if - there is an event listener for group C or not, if group B is registered for - local notification. - -The level and event notification mode ("hierarchy" or "local", if necessary) are -specified by a comma-delimited string, i.e. "low,hierarchy" specifies -hierarchical, pass-through, notification for all ancestor memcgs. Notification -that is the default, non pass-through behavior, does not specify a mode. -"medium,local" specifies pass-through notification for the medium level. - -The file memory.pressure_level is only used to setup an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write string as " " - to cgroup.event_control. - -Application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations to -memory.pressure_level are no implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes child - cgroup experience a critical pressure:: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low,hierarchy & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12. TODO -======== - -1. Make per-cgroup scanner reclaim not-shared pages first -2. Teach controller to account for shared-pages -3. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary -======= - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References -========== - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst deleted file mode 100644 index a2cf272af7a0..000000000000 --- a/Documentation/cgroup-v1/net_cls.rst +++ /dev/null @@ -1,44 +0,0 @@ -========================= -Network classifier cgroup -========================= - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. -Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example:: - - mkdir /sys/fs/cgroup/net_cls - mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls - mkdir /sys/fs/cgroup/net_cls/0 - echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - -- setting a 10:1 handle:: - - cat /sys/fs/cgroup/net_cls/0/net_cls.classid - 1048577 - -- configuring tc:: - - tc qdisc add dev eth0 root handle 10: htb - tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - -- creating traffic class 10:1:: - - tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example:: - - iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst deleted file mode 100644 index b40905871c64..000000000000 --- a/Documentation/cgroup-v1/net_prio.rst +++ /dev/null @@ -1,57 +0,0 @@ -======================= -Network priority cgroup -======================= - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option. This however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. Network priority groups can -be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem specific - -net_prio.prioidx - This file is read-only, and is simply informative. It contains a unique - integer value that the kernel uses as an internal representation of this - cgroup. - -net_prio.ifpriomap - This file contains a map of the priorities assigned to traffic originating - from processes in this group and egressing the system on various interfaces. - It contains a list of tuples in the form . Contents of this - file can be modified by echoing a string into the file using the same tuple - format. For example:: - - echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc) so priorities will be assigned prior to the hardware -queue selection being made. - -One usage for the net_prio cgroup is with mqprio qdisc allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst deleted file mode 100644 index 6acebd9e72c8..000000000000 --- a/Documentation/cgroup-v1/pids.rst +++ /dev/null @@ -1,92 +0,0 @@ -========================= -Process Number Controller -========================= - -Abstract --------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ------ - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone(). fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. - -The pids.events file contains event counters: - - - max: Number of times fork failed because limit was hit. - -Example -------- - -First, we mount the pids controller:: - - # mkdir -p /sys/fs/cgroup/pids - # mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it:: - - # mkdir -p /sys/fs/cgroup/pids/parent/child - # echo 2 > /sys/fs/cgroup/pids/parent/pids.max - # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail:: - - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's):: - - # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.max - max - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current):: - - # echo 1 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # echo 0 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst deleted file mode 100644 index 2fcb0a9bf790..000000000000 --- a/Documentation/cgroup-v1/rdma.rst +++ /dev/null @@ -1,117 +0,0 @@ -=============== -RDMA Controller -=============== - -.. Contents - - 1. Overview - 1-1. What is RDMA controller? - 1-2. Why RDMA controller needed? - 1-3. How is RDMA controller implemented? - 2. Usage Examples - -1. Overview -=========== - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can lead to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? ----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources. Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. - -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - - ========== ============================= - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - ========== ============================= - -2. Usage Examples -================= - -(a) Configure resource limit:: - - echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max - echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit:: - - cat /sys/fs/cgroup/rdma/2/rdma.max - #Output: - mlx4_0 hca_handle=2 hca_object=2000 - ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage:: - - cat /sys/fs/cgroup/rdma/2/rdma.current - #Output: - mlx4_0 hca_handle=1 hca_object=20 - ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit:: - - echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index cad797a8a39e..5ecbc03e6b2f 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed +[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 5623b9916411..4f18456dd3b1 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -12,7 +12,7 @@ References - Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. -- Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - man taskset: Using the taskset command to bind tasks to sets of CPUs. diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst index 3391e86d810c..14a2f7bf63fe 100644 --- a/Documentation/scheduler/sched-deadline.rst +++ b/Documentation/scheduler/sched-deadline.rst @@ -669,7 +669,7 @@ Deadline Task Scheduling -deadline tasks cannot have an affinity mask smaller that the entire root_domain they are created on. However, affinities can be specified - through the cpuset facility (Documentation/cgroup-v1/cpusets.rst). + through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst). 5.1 SCHED_DEADLINE and cpusets HOWTO ------------------------------------ diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index 53b30d1967cf..a96c72651877 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -222,7 +222,7 @@ SCHED_BATCH) tasks. These options need CONFIG_CGROUPS to be defined, and let the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroup-v1/cgroups.rst for more information about this filesystem. + Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem. When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index d27d3f3712fd..655a096ec8fb 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "/cpu.rt_runtime_us" to control the CPU time reserved for each control group. For more information on working with control groups, you should read -Documentation/cgroup-v1/cgroups.rst as well. +Documentation/admin-guide/cgroup-v1/cgroups.rst as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 130f3cfa1c19..99fdeca917ca 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst @@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells' physical memory. NUMA emluation is useful for testing NUMA kernel and application features on non-NUMA platforms, and as a sort of memory resource management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.rst] +[see Documentation/admin-guide/cgroup-v1/cpusets.rst] For each node with memory, Linux constructs an independent memory management subsystem, complete with its own free page lists, in-use page lists, usage @@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see System administrators can restrict the CPUs and nodes' memories that a non- privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst] +using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] On architectures that do not hide memoryless nodes, Linux will include only zones [nodes] with memory in the zonelists. This means that for a memoryless diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 35bba27d5fff..1d6cd7db4e43 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -41,7 +41,7 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.rst). +Documentation/admin-guide/cgroup-v1/cpusets.rst). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index 109052215bce..17d0861b0f1d 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -98,7 +98,7 @@ Memory Control Group Interaction -------------------------------- The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.rst] by extending the +memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the lru_list enum. The memory controller data structure automatically gets a per-zone unevictable diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 30108684ae87..ff9bcfd2cc14 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. For more information on the features of cpusets, see -Documentation/cgroup-v1/cpusets.rst. +Documentation/admin-guide/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. @@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory +Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory address spaces) to individual cpusets:: [root@xroads /]# mkdir exampleset -- cgit From 4f4cfa6c560c93ba180c30675cf845e1597de44c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 14:56:51 -0300 Subject: docs: admin-guide: add a series of orphaned documents There are lots of documents that belong to the admin-guide but are on random places (most under Documentation root dir). Move them to the admin guide. Signed-off-by: Mauro Carvalho Chehab Acked-by: Alexandre Belloni Acked-by: Bartlomiej Zolnierkiewicz --- Documentation/ABI/stable/sysfs-devices-node | 2 +- Documentation/ABI/testing/procfs-diskstats | 2 +- Documentation/ABI/testing/sysfs-block | 2 +- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 +- Documentation/admin-guide/btmrvl.rst | 124 +++++++ Documentation/admin-guide/clearing-warn-once.rst | 9 + Documentation/admin-guide/cpu-load.rst | 114 +++++++ Documentation/admin-guide/cputopology.rst | 177 ++++++++++ .../admin-guide/device-mapper/statistics.rst | 4 +- Documentation/admin-guide/efi-stub.rst | 100 ++++++ Documentation/admin-guide/highuid.rst | 80 +++++ Documentation/admin-guide/hw-vuln/l1tf.rst | 2 +- Documentation/admin-guide/hw_random.rst | 105 ++++++ Documentation/admin-guide/index.rst | 17 + Documentation/admin-guide/iostats.rst | 197 ++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 2 +- .../admin-guide/kernel-per-CPU-kthreads.rst | 356 +++++++++++++++++++++ Documentation/admin-guide/lcd-panel-cgram.rst | 27 ++ Documentation/admin-guide/ldm.rst | 121 +++++++ Documentation/admin-guide/lockup-watchdogs.rst | 83 +++++ Documentation/admin-guide/mm/cma_debugfs.rst | 25 ++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/admin-guide/numastat.rst | 30 ++ Documentation/admin-guide/pnp.rst | 292 +++++++++++++++++ Documentation/admin-guide/rtc.rst | 140 ++++++++ Documentation/admin-guide/svga.rst | 249 ++++++++++++++ Documentation/admin-guide/sysctl/kernel.rst | 2 +- Documentation/admin-guide/video-output.rst | 34 ++ Documentation/auxdisplay/lcd-panel-cgram.rst | 29 -- Documentation/btmrvl.txt | 124 ------- Documentation/clearing-warn-once.txt | 9 - Documentation/cma/debugfs.rst | 27 -- Documentation/cpu-load.txt | 114 ------- Documentation/cputopology.txt | 177 ---------- Documentation/efi-stub.txt | 100 ------ Documentation/fb/vesafb.rst | 2 +- Documentation/highuid.txt | 80 ----- Documentation/hw_random.txt | 105 ------ Documentation/iostats.txt | 197 ------------ Documentation/kernel-per-CPU-kthreads.txt | 356 --------------------- Documentation/ldm.txt | 121 ------- Documentation/lockup-watchdogs.txt | 83 ----- Documentation/numastat.txt | 30 -- Documentation/pnp.txt | 292 ----------------- Documentation/rtc.txt | 140 -------- Documentation/svga.txt | 249 -------------- Documentation/video-output.txt | 34 -- Documentation/x86/topology.rst | 2 +- 48 files changed, 2293 insertions(+), 2279 deletions(-) create mode 100644 Documentation/admin-guide/btmrvl.rst create mode 100644 Documentation/admin-guide/clearing-warn-once.rst create mode 100644 Documentation/admin-guide/cpu-load.rst create mode 100644 Documentation/admin-guide/cputopology.rst create mode 100644 Documentation/admin-guide/efi-stub.rst create mode 100644 Documentation/admin-guide/highuid.rst create mode 100644 Documentation/admin-guide/hw_random.rst create mode 100644 Documentation/admin-guide/iostats.rst create mode 100644 Documentation/admin-guide/kernel-per-CPU-kthreads.rst create mode 100644 Documentation/admin-guide/lcd-panel-cgram.rst create mode 100644 Documentation/admin-guide/ldm.rst create mode 100644 Documentation/admin-guide/lockup-watchdogs.rst create mode 100644 Documentation/admin-guide/mm/cma_debugfs.rst create mode 100644 Documentation/admin-guide/numastat.rst create mode 100644 Documentation/admin-guide/pnp.rst create mode 100644 Documentation/admin-guide/rtc.rst create mode 100644 Documentation/admin-guide/svga.rst create mode 100644 Documentation/admin-guide/video-output.rst delete mode 100644 Documentation/auxdisplay/lcd-panel-cgram.rst delete mode 100644 Documentation/btmrvl.txt delete mode 100644 Documentation/clearing-warn-once.txt delete mode 100644 Documentation/cma/debugfs.rst delete mode 100644 Documentation/cpu-load.txt delete mode 100644 Documentation/cputopology.txt delete mode 100644 Documentation/efi-stub.txt delete mode 100644 Documentation/highuid.txt delete mode 100644 Documentation/hw_random.txt delete mode 100644 Documentation/iostats.txt delete mode 100644 Documentation/kernel-per-CPU-kthreads.txt delete mode 100644 Documentation/ldm.txt delete mode 100644 Documentation/lockup-watchdogs.txt delete mode 100644 Documentation/numastat.txt delete mode 100644 Documentation/pnp.txt delete mode 100644 Documentation/rtc.txt delete mode 100644 Documentation/svga.txt delete mode 100644 Documentation/video-output.txt (limited to 'Documentation') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index f7ce68fbd4b9..df8413cf1468 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -61,7 +61,7 @@ Date: October 2002 Contact: Linux Memory Management list Description: The node's hit/miss statistics, in units of pages. - See Documentation/numastat.txt + See Documentation/admin-guide/numastat.rst What: /sys/devices/system/node/nodeX/distance Date: October 2002 diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index abac31d216de..2c44b4f1b060 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats @@ -29,4 +29,4 @@ Description: 17 - sectors discarded 18 - time spent discarding - For more details refer to Documentation/iostats.txt + For more details refer to Documentation/admin-guide/iostats.rst diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index dfad7427817c..f8c7c7126bb1 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -15,7 +15,7 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) - For more details refer Documentation/iostats.txt + For more details refer Documentation/admin-guide/iostats.rst What: /sys/block///stat diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index d404603c6b52..5f7d7b14fa44 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -34,7 +34,7 @@ Description: CPU topology files that describe kernel limits related to present: cpus that have been identified as being present in the system. - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/probe @@ -103,7 +103,7 @@ Description: CPU topology files that describe a logical CPU's relationship thread_siblings_list: human-readable list of cpu#'s hardware threads within the same core as cpu# - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/cpuidle/current_driver diff --git a/Documentation/admin-guide/btmrvl.rst b/Documentation/admin-guide/btmrvl.rst new file mode 100644 index 000000000000..ec57740ead0c --- /dev/null +++ b/Documentation/admin-guide/btmrvl.rst @@ -0,0 +1,124 @@ +============= +btmrvl driver +============= + +All commands are used via debugfs interface. + +Set/get driver configurations +============================= + +Path: /debug/btmrvl/config/ + +gpiogap=[n], hscfgcmd + These commands are used to configure the host sleep parameters:: + bit 8:0 -- Gap + bit 16:8 -- GPIO + + where GPIO is the pin number of GPIO used to wake up the host. + It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface + wakeup will be used instead). + + where Gap is the gap in milli seconds between wakeup signal and + wakeup event, or 0xff for special host sleep setting. + + Usage:: + + # Use SDIO interface to wake up the host and set GAP to 0x80: + echo 0xff80 > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + + # Use GPIO pin #3 to wake up the host and set GAP to 0xff: + echo 0x03ff > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + +psmode=[n], pscmd + These commands are used to enable/disable auto sleep mode + + where the option is:: + + 1 -- Enable auto sleep mode + 0 -- Disable auto sleep mode + + Usage:: + + # Enable auto sleep mode + echo 1 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + # Disable auto sleep mode + echo 0 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + +hsmode=[n], hscmd + These commands are used to enable host sleep or wake up firmware + + where the option is:: + + 1 -- Enable host sleep + 0 -- Wake up firmware + + Usage:: + + # Enable host sleep + echo 1 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + # Wake up firmware + echo 0 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + +Get driver status +================= + +Path: /debug/btmrvl/status/ + +Usage:: + + cat /debug/btmrvl/status/ + +where the args are: + +curpsmode + This command displays current auto sleep status. + +psstate + This command display the power save state. + +hsstate + This command display the host sleep state. + +txdnldrdy + This command displays the value of Tx download ready flag. + +Issuing a raw hci command +========================= + +Use hcitool to issue raw hci command, refer to hcitool manual + +Usage:: + + Hcitool cmd [Parameters] + +Interface Control Command:: + + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface + +SD8688 firmware +=============== + +Images: + +- /lib/firmware/sd8688_helper.bin +- /lib/firmware/sd8688.bin + + +The images can be downloaded from: + +git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/admin-guide/clearing-warn-once.rst b/Documentation/admin-guide/clearing-warn-once.rst new file mode 100644 index 000000000000..211fd926cf00 --- /dev/null +++ b/Documentation/admin-guide/clearing-warn-once.rst @@ -0,0 +1,9 @@ +Clearing WARN_ONCE +------------------ + +WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. + +echo 1 > /sys/kernel/debug/clear_warn_once + +clears the state and allows the warnings to print once again. +This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst new file mode 100644 index 000000000000..2d01ce43d2a2 --- /dev/null +++ b/Documentation/admin-guide/cpu-load.rst @@ -0,0 +1,114 @@ +======== +CPU load +======== + +Linux exports various bits of information via ``/proc/stat`` and +``/proc/uptime`` that userland tools, such as top(1), use to calculate +the average time system spent in a particular state, for example:: + + $ iostat + Linux 2.6.18.3-exp (linmac) 02/20/2007 + + avg-cpu: %user %nice %system %iowait %steal %idle + 10.01 0.00 2.92 5.44 0.00 81.63 + + ... + +Here the system thinks that over the default sampling period the +system spent 10.01% of the time doing work in user space, 2.92% in the +kernel, and was overall 81.63% of the time idle. + +In most cases the ``/proc/stat`` information reflects the reality quite +closely, however due to the nature of how/when the kernel collects +this data sometimes it can not be trusted at all. + +So how is this information collected? Whenever timer interrupt is +signalled the kernel looks what kind of task was running at this +moment and increments the counter that corresponds to this tasks +kind/state. The problem with this is that the system could have +switched between various states multiple times between two timer +interrupts yet the counter is incremented only for the last state. + + +Example +------- + +If we imagine the system with one task that periodically burns cycles +in the following manner:: + + time line between two timer interrupts + |--------------------------------------| + ^ ^ + |_ something begins working | + |_ something goes to sleep + (only to be awaken quite soon) + +In the above situation the system will be 0% loaded according to the +``/proc/stat`` (since the timer interrupt will always happen when the +system is executing the idle handler), but in reality the load is +closer to 99%. + +One can imagine many more situations where this behavior of the kernel +will lead to quite erratic information inside ``/proc/stat``:: + + + /* gcc -o hog smallhog.c */ + #include + #include + #include + #include + #define HIST 10 + + static volatile sig_atomic_t stop; + + static void sighandler (int signr) + { + (void) signr; + stop = 1; + } + static unsigned long hog (unsigned long niters) + { + stop = 0; + while (!stop && --niters); + return niters; + } + int main (void) + { + int i; + struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, + .it_value = { .tv_sec = 0, .tv_usec = 1 } }; + sigset_t set; + unsigned long v[HIST]; + double tmp = 0.0; + unsigned long n; + signal (SIGALRM, &sighandler); + setitimer (ITIMER_REAL, &it, NULL); + + hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) tmp += v[i]; + tmp /= HIST; + n = tmp - (tmp / 3.0); + + sigemptyset (&set); + sigaddset (&set, SIGALRM); + + for (;;) { + hog (n); + sigwait (&set, &i); + } + return 0; + } + + +References +---------- + +- http://lkml.org/lkml/2007/2/12/6 +- Documentation/filesystems/proc.txt (1.8) + + +Thanks +------ + +Con Kolivas, Pavel Machek diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst new file mode 100644 index 000000000000..b90dafcc8237 --- /dev/null +++ b/Documentation/admin-guide/cputopology.rst @@ -0,0 +1,177 @@ +=========================================== +How CPU topology info is exported via sysfs +=========================================== + +Export CPU topology info via sysfs. Items (attributes) are similar +to /proc/cpuinfo output of some architectures. They reside in +/sys/devices/system/cpu/cpuX/topology/: + +physical_package_id: + + physical package id of cpuX. Typically corresponds to a physical + socket number, but the actual value is architecture and platform + dependent. + +die_id: + + the CPU die ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_id: + + the CPU core ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +book_id: + + the book ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +drawer_id: + + the drawer ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_cpus: + + internal kernel map of CPUs within the same core. + (deprecated name: "thread_siblings") + +core_cpus_list: + + human-readable list of CPUs within the same core. + (deprecated name: "thread_siblings_list"); + +package_cpus: + + internal kernel map of the CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings") + +package_cpus_list: + + human-readable list of CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings_list") + +die_cpus: + + internal kernel map of CPUs within the same die. + +die_cpus_list: + + human-readable list of CPUs within the same die. + +book_siblings: + + internal kernel map of cpuX's hardware threads within the same + book_id. + +book_siblings_list: + + human-readable list of cpuX's hardware threads within the same + book_id. + +drawer_siblings: + + internal kernel map of cpuX's hardware threads within the same + drawer_id. + +drawer_siblings_list: + + human-readable list of cpuX's hardware threads within the same + drawer_id. + +Architecture-neutral, drivers/base/topology.c, exports these attributes. +However, the book and drawer related sysfs files will only be created if +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. + +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, +where they reflect the cpu and cache hierarchy. + +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h:: + + #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) + #define topology_core_id(cpu) + #define topology_book_id(cpu) + #define topology_drawer_id(cpu) + #define topology_sibling_cpumask(cpu) + #define topology_core_cpumask(cpu) + #define topology_die_cpumask(cpu) + #define topology_book_cpumask(cpu) + #define topology_drawer_cpumask(cpu) + +The type of ``**_id macros`` is int. +The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter +correspond with appropriate ``**_siblings`` sysfs attributes (except for +topology_sibling_cpumask() which corresponds with thread_siblings). + +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: + +1) topology_physical_package_id: -1 +2) topology_die_id: -1 +3) topology_core_id: 0 +4) topology_sibling_cpumask: just the given CPU +5) topology_core_cpumask: just the given CPU +6) topology_die_cpumask: just the given CPU + +For architectures that don't support books (CONFIG_SCHED_BOOK) there are no +default definitions for topology_book_id() and topology_book_cpumask(). +For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are +no default definitions for topology_drawer_id() and topology_drawer_cpumask(). + +Additionally, CPU topology information is provided under +/sys/devices/system/cpu and includes these files. The internal +source for the output is in brackets ("[]"). + + =========== ========================================================== + kernel_max: the maximum CPU index allowed by the kernel configuration. + [NR_CPUS-1] + + offline: CPUs that are not online because they have been + HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit + of CPUs allowed by the kernel configuration (kernel_max + above). [~cpu_online_mask + cpus >= NR_CPUS] + + online: CPUs that are online and being scheduled [cpu_online_mask] + + possible: CPUs that have been allocated resources and can be + brought online if they are present. [cpu_possible_mask] + + present: CPUs that have been identified as being present in the + system. [cpu_present_mask] + =========== ========================================================== + +The format for the above output is compatible with cpulist_parse() +[see ]. Some examples follow. + +In this example, there are 64 CPUs in the system but cpus 32-63 exceed +the kernel max which is limited to 0..31 by the NR_CPUS config option +being 32. Note also that CPUs 2 and 4-31 are not online but could be +brought online as they are both present and possible:: + + kernel_max: 31 + offline: 2,4-31,32-63 + online: 0-1,3 + possible: 0-31 + present: 0-31 + +In this example, the NR_CPUS config option is 128, but the kernel was +started with possible_cpus=144. There are 4 CPUs in the system and cpu2 +was manually taken offline (and is the only CPU that can be brought +online.):: + + kernel_max: 127 + offline: 2,4-127,128-143 + online: 0-1,3 + possible: 0-127 + present: 0-3 + +See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter +as well as more information on the various cpumasks. diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst index 3d80a9f850cc..41ded0bc5933 100644 --- a/Documentation/admin-guide/device-mapper/statistics.rst +++ b/Documentation/admin-guide/device-mapper/statistics.rst @@ -13,7 +13,7 @@ the range specified. The I/O statistics counters for each step-sized area of a region are in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are +Documentation/admin-guide/iostats.rst). But two extra counters (12 and 13) are provided: total time spent reading and writing. When the histogram argument is used, the 14th parameter is reported that represents the histogram of latencies. All these counters may be accessed by sending @@ -151,7 +151,7 @@ Messages The first 11 counters have the same meaning as `/sys/block/*/stat or /proc/diskstats`. - Please refer to Documentation/iostats.txt for details. + Please refer to Documentation/admin-guide/iostats.rst for details. 1. the number of reads completed 2. the number of reads merged diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst new file mode 100644 index 000000000000..833edb0d0bc4 --- /dev/null +++ b/Documentation/admin-guide/efi-stub.rst @@ -0,0 +1,100 @@ +================= +The EFI Boot Stub +================= + +On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade +as a PE/COFF image, thereby convincing EFI firmware loaders to load +it as an EFI executable. The code that modifies the bzImage header, +along with the EFI-specific entry point that the firmware loader +jumps to are collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. For ARM the EFI stub is implemented in +arch/arm/boot/compressed/efi-header.S and +arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared +between architectures is in drivers/firmware/efi/libstub. + +For arm64, there is no compressed kernel support, so the Image itself +masquerades as a PE/COFF image and the EFI stub is linked into the +kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S +and drivers/firmware/efi/libstub/arm64-stub.c. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +How to install bzImage.efi +-------------------------- + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partition (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. For ARM the +arch/arm/boot/zImage should be copied to the system partition, and it +may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image +should be copied but not necessarily renamed. + + +Passing kernel parameters from the EFI shell +-------------------------------------------- + +Arguments to the kernel can be passed after bzImage.efi, e.g.:: + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +The "initrd=" option +-------------------- + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option. This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout:: + + fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used:: + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. + + +The "dtb=" option +----------------- + +For the ARM and arm64 architectures, a device tree must be provided to +the kernel. Normally firmware shall supply the device tree via the +EFI CONFIGURATION TABLE. However, the "dtb=" command line option can +be used to override the firmware supplied device tree, or to supply +one when firmware is unable to. + +Please note: Firmware adds runtime configuration information to the +device tree before booting the kernel. If dtb= is used to override +the device tree, then any runtime data provided by firmware will be +lost. The dtb= option should only be used either as a debug tool, or +as a last resort when a device tree is not provided in the EFI +CONFIGURATION TABLE. + +"dtb=" is processed in the same manner as the "initrd=" option that is +described above. diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst new file mode 100644 index 000000000000..6ee70465c0ea --- /dev/null +++ b/Documentation/admin-guide/highuid.rst @@ -0,0 +1,80 @@ +=================================================== +Notes on the change from 16-bit UIDs to 32-bit UIDs +=================================================== + +:Author: Chris Wing +:Last updated: January 11, 2000 + +- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t + when communicating between user and kernel space in an ioctl or data + structure. + +- kernel code should use uid_t and gid_t in kernel-private structures and + code. + +What's left to be done for 32-bit UIDs on all Linux architectures: + +- Disk quotas have an interesting limitation that is not related to the + maximum UID/GID. They are limited by the maximum file size on the + underlying filesystem, because quota records are written at offsets + corresponding to the UID in question. + Further investigation is needed to see if the quota system can cope + properly with huge UIDs. If it can deal with 64-bit file offsets on all + architectures, this should not be a problem. + +- Decide whether or not to keep backwards compatibility with the system + accounting file, or if we should break it as the comments suggest + (currently, the old 16-bit UID and GID are still written to disk, and + part of the former pad space is used to store separate 32-bit UID and + GID) + +- Need to validate that OS emulation calls the 16-bit UID + compatibility syscalls, if the OS being emulated used 16-bit UIDs, or + uses the 32-bit UID system calls properly otherwise. + + This affects at least: + + - iBCS on Intel + + - sparc32 emulation on sparc64 + (need to support whatever new 32-bit UID system calls are added to + sparc32) + +- Validate that all filesystems behave properly. + + At present, 32-bit UIDs _should_ work for: + + - ext2 + - ufs + - isofs + - nfs + - coda + - udf + + Ioctl() fixups have been made for: + + - ncpfs + - smbfs + + Filesystems with simple fixups to prevent 16-bit UID wraparound: + + - minix + - sysv + - qnx4 + + Other filesystems have not been checked yet. + +- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in + all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but + more are needed. (as well as new user<->kernel data structures) + +- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, + sh, and sparc32. Fixing this is probably not that important, but would + require adding a new ELF section. + +- The ioctl()s used to control the in-kernel NFS server only support + 16-bit UIDs on arm, i386, m68k, sh, and sparc32. + +- make sure that the UID mapping feature of AX25 networking works properly + (it should be safe because it's always used a 32-bit integer to + communicate between user and kernel) diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst index 656aee262e23..f83212fae4d5 100644 --- a/Documentation/admin-guide/hw-vuln/l1tf.rst +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -241,7 +241,7 @@ Guest mitigation mechanisms For further information about confining guests to a single or to a group of cores consult the cpusets documentation: - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst + https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst .. _interrupt_isolation: diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst new file mode 100644 index 000000000000..121de96e395e --- /dev/null +++ b/Documentation/admin-guide/hw_random.rst @@ -0,0 +1,105 @@ +========================================================== +Linux support for random number generator in i8xx chipsets +========================================================== + +Introduction +============ + +The hw_random framework is software that makes use of a +special hardware feature on your CPU or motherboard, +a Random Number Generator (RNG). The software has two parts: +a core providing the /dev/hwrng character device and its +sysfs support, plus a hardware-specific driver that plugs +into that core. + +To make the most effective use of these mechanisms, you +should download the support software as well. Download the +latest version of the "rng-tools" package from the +hw_random driver's official Web site: + + http://sourceforge.net/projects/gkernel/ + +Those tools use /dev/hwrng to fill the kernel entropy pool, +which is used internally and exported by the /dev/urandom and +/dev/random special files. + +Theory of operation +=================== + +CHARACTER DEVICE. Using the standard open() +and read() system calls, you can read random data from +the hardware RNG device. This data is NOT CHECKED by any +fitness tests, and could potentially be bogus (if the +hardware is faulty or has been tampered with). Data is only +output if the hardware "has-data" flag is set, but nevertheless +a security-conscious person would run fitness tests on the +data before assuming it is truly random. + +The rng-tools package uses such tests in "rngd", and lets you +run them by hand with a "rngtest" utility. + +/dev/hwrng is char device major 10, minor 183. + +CLASS DEVICE. There is a /sys/class/misc/hw_random node with +two unique attributes, "rng_available" and "rng_current". The +"rng_available" attribute lists the hardware-specific drivers +available, while "rng_current" lists the one which is currently +connected to /dev/hwrng. If your system has more than one +RNG available, you may change the one used by writing a name from +the list in "rng_available" into "rng_current". + +========================================================================== + + +Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) + - Copyright 2000,2001 Jeff Garzik + - Copyright 2000,2001 Philipp Rumpf + + +About the Intel RNG hardware, from the firmware hub datasheet +============================================================= + +The Firmware Hub integrates a Random Number Generator (RNG) +using thermal noise generated from inherently random quantum +mechanical properties of silicon. When not generating new random +bits the RNG circuitry will enter a low power state. Intel will +provide a binary software driver to give third party software +access to our RNG for use as a security feature. At this time, +the RNG is only to be used with a system in an OS-present state. + +Intel RNG Driver notes +====================== + +FIXME: support poll(2) + +.. note:: + + request_mem_region was removed, for three reasons: + + 1) Only one RNG is supported by this driver; + 2) The location used by the RNG is a fixed location in + MMIO-addressable memory; + 3) users with properly working BIOS e820 handling will always + have the region in which the RNG is located reserved, so + request_mem_region calls always fail for proper setups. + However, for people who use mem=XX, BIOS e820 information is + **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can + succeed. + +Driver details +============== + +Based on: + Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet + May 1999 Order Number: 290658-002 R + +Intel 82802 Firmware Hub: + Random Number Generator + Programmer's Reference Manual + December 1999 Order Number: 298029-001 R + +Intel 82802 Firmware HUB Random Number Generator Driver + Copyright (c) 2000 Matt Sottek + +Special thanks to Matt Sottek. I did the "guts", he +did the "brains" and all the testing. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index a5fdb1a846ce..4e98f5596da0 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -85,8 +85,25 @@ configure specific aspects of kernel behavior to your liking. perf-security acpi/index aoe/index + btmrvl + clearing-warn-once + cpu-load + cputopology device-mapper/index + efi-stub + highuid + hw_random + iostats + kernel-per-CPU-kthreads laptops/index + lcd-panel-cgram + ldm + lockup-watchdogs + numastat + pnp + rtc + svga + video-output .. only:: subproject and html diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst new file mode 100644 index 000000000000..5d63b18bd6d1 --- /dev/null +++ b/Documentation/admin-guide/iostats.rst @@ -0,0 +1,197 @@ +===================== +I/O statistics fields +===================== + +Since 2.4.20 (and some versions before, with patches), and 2.5.45, +more extensive disk statistics have been introduced to help measure disk +activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do +the work for you, but in case you are interested in creating your own +tools, the fields are explained here. + +In 2.4 now, the information is found as additional fields in +``/proc/partitions``. In 2.6 and upper, the same information is found in two +places: one is in the file ``/proc/diskstats``, and the other is within +the sysfs file system, which must be mounted in order to obtain +the information. Throughout this document we'll assume that sysfs +is mounted on ``/sys``, although of course it may be mounted anywhere. +Both ``/proc/diskstats`` and sysfs use the same source for the information +and so should not differ. + +Here are examples of these different formats:: + + 2.4: + 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 + + 2.6+ sysfs: + 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 35486 38030 38030 38030 + + 2.6+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 hda1 35486 38030 38030 38030 + + 4.18+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 + +On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have +a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. + +The advantage of one over the other is that the sysfs choice works well +if you are watching a known, small set of disks. ``/proc/diskstats`` may +be a better choice if you are watching a large number of disks because +you'll avoid the overhead of 50, 100, or 500 or more opens/closes with +each snapshot of your disk statistics. + +In 2.4, the statistics fields are those after the device name. In +the above example, the first field of statistics would be 446216. +By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll +find just the eleven fields, beginning with 446216. If you look at +``/proc/diskstats``, the eleven fields will be preceded by the major and +minor device numbers, and device name. Each of these formats provides +eleven fields of statistics, each meaning exactly the same things. +All fields except field 9 are cumulative since boot. Field 9 should +go to zero as I/Os complete; all others only increase (unless they +overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long +(native word size) numbers, and on a very busy or long-lived system they +may wrap. Applications should be prepared to deal with that; unless +your observations are measured in large numbers of minutes or hours, +they should not wrap twice before you notice them. + +Each set of stats only applies to the indicated device; if you want +system-wide stats you'll have to find all the devices and sum them all up. + +Field 1 -- # of reads completed + This is the total number of reads completed successfully. + +Field 2 -- # of reads merged, field 6 -- # of writes merged + Reads and writes which are adjacent to each other may be merged for + efficiency. Thus two 4K reads may become one 8K read before it is + ultimately handed to the disk, and so it will be counted (and queued) + as only one I/O. This field lets you know how often this was done. + +Field 3 -- # of sectors read + This is the total number of sectors read successfully. + +Field 4 -- # of milliseconds spent reading + This is the total number of milliseconds spent by all reads (as + measured from __make_request() to end_that_request_last()). + +Field 5 -- # of writes completed + This is the total number of writes completed successfully. + +Field 6 -- # of writes merged + See the description of field 2. + +Field 7 -- # of sectors written + This is the total number of sectors written successfully. + +Field 8 -- # of milliseconds spent writing + This is the total number of milliseconds spent by all writes (as + measured from __make_request() to end_that_request_last()). + +Field 9 -- # of I/Os currently in progress + The only field that should go to zero. Incremented as requests are + given to appropriate struct request_queue and decremented as they finish. + +Field 10 -- # of milliseconds spent doing I/Os + This field increases so long as field 9 is nonzero. + + Since 5.0 this field counts jiffies when at least one request was + started or completed. If request runs more than 2 jiffies then some + I/O time will not be accounted unless there are other requests. + +Field 11 -- weighted # of milliseconds spent doing I/Os + This field is incremented at each I/O start, I/O completion, I/O + merge, or read of these stats by the number of I/Os in progress + (field 9) times the number of milliseconds spent doing I/O since the + last update of this field. This can provide an easy measure of both + I/O completion time and the backlog that may be accumulating. + +Field 12 -- # of discards completed + This is the total number of discards completed successfully. + +Field 13 -- # of discards merged + See the description of field 2 + +Field 14 -- # of sectors discarded + This is the total number of sectors discarded successfully. + +Field 15 -- # of milliseconds spent discarding + This is the total number of milliseconds spent by all discards (as + measured from __make_request() to end_that_request_last()). + +To avoid introducing performance bottlenecks, no locks are held while +modifying these counters. This implies that minor inaccuracies may be +introduced when changes collide, so (for instance) adding up all the +read I/Os issued per partition should equal those made to the disks ... +but due to the lack of locking it may only be very close. + +In 2.6+, there are counters for each CPU, which make the lack of locking +almost a non-issue. When the statistics are read, the per-CPU counters +are summed (possibly overflowing the unsigned long variable they are +summed to) and the result given to the user. There is no convenient +user interface for accessing the per-CPU counters themselves. + +Disks vs Partitions +------------------- + +There were significant changes between 2.4 and 2.6+ in the I/O subsystem. +As a result, some statistic information disappeared. The translation from +a disk address relative to a partition to the disk address relative to +the host disk happens much earlier. All merges and timings now happen +at the disk level rather than at both the disk and partition level as +in 2.4. Consequently, you'll see a different statistics output on 2.6+ for +partitions from that for disks. There are only *four* fields available +for partitions on 2.6+ machines. This is reflected in the examples above. + +Field 1 -- # of reads issued + This is the total number of reads issued to this partition. + +Field 2 -- # of sectors read + This is the total number of sectors requested to be read from this + partition. + +Field 3 -- # of writes issued + This is the total number of writes issued to this partition. + +Field 4 -- # of sectors written + This is the total number of sectors requested to be written to + this partition. + +Note that since the address is translated to a disk-relative one, and no +record of the partition-relative address is kept, the subsequent success +or failure of the read cannot be attributed to the partition. In other +words, the number of reads for partitions is counted slightly before time +of queuing for partitions, and at completion for whole disks. This is +a subtle distinction that is probably uninteresting for most cases. + +More significant is the error induced by counting the numbers of +reads/writes before merges for partitions and after for disks. Since a +typical workload usually contains a lot of successive and adjacent requests, +the number of reads/writes issued can be several times higher than the +number of reads/writes completed. + +In 2.6.25, the full statistic set is again available for partitions and +disk and partition statistics are consistent again. Since we still don't +keep record of the partition-relative address, an operation is attributed to +the partition which contains the first sector of the request after the +eventual merges. As requests can be merged across partition, this could lead +to some (probably insignificant) inaccuracy. + +Additional notes +---------------- + +In 2.6+, sysfs is not mounted by default. If your distribution of +Linux hasn't added it already, here's the line you'll want to add to +your ``/etc/fstab``:: + + none /sys sysfs defaults 0 0 + + +In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they +appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in +``/proc/stat`` take a very different format from those in ``/proc/partitions`` +(see proc(5), if your system has it.) + +-- ricklind@us.ibm.com diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a571a67e0c85..19b1e3bef56c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5066,7 +5066,7 @@ vga= [BOOT,X86-32] Select a particular video mode See Documentation/x86/boot.rst and - Documentation/svga.txt. + Documentation/admin-guide/svga.rst. Use vga=ask for menu. This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst new file mode 100644 index 000000000000..4f18456dd3b1 --- /dev/null +++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst @@ -0,0 +1,356 @@ +========================================== +Reducing OS jitter due to per-cpu kthreads +========================================== + +This document lists per-CPU kthreads in the Linux kernel and presents +options to control their OS jitter. Note that non-per-CPU kthreads are +not listed here. To reduce OS jitter from non-per-CPU kthreads, bind +them to a "housekeeping" CPU dedicated to such work. + +References +========== + +- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. + +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. + +- man taskset: Using the taskset command to bind tasks to sets + of CPUs. + +- man sched_setaffinity: Using the sched_setaffinity() system + call to bind tasks to sets of CPUs. + +- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, + writing "0" to offline and "1" to online. + +- In order to locate kernel-generated OS jitter on CPU N: + + cd /sys/kernel/debug/tracing + echo 1 > max_graph_depth # Increase the "1" for more detail + echo function_graph > current_tracer + # run workload + cat per_cpu/cpuN/trace + +kthreads +======== + +Name: + ehca_comp/%u + +Purpose: + Periodically process Infiniband-related work. + +To reduce its OS jitter, do any of the following: + +1. Don't use eHCA Infiniband hardware, instead choosing hardware + that does not require per-CPU kthreads. This will prevent these + kthreads from being created in the first place. (This will + work for most people, as this hardware, though important, is + relatively old and is produced in relatively low unit volumes.) +2. Do all eHCA-Infiniband-related work on other CPUs, including + interrupts. +3. Rework the eHCA driver so that its per-CPU kthreads are + provisioned only on selected CPUs. + + +Name: + irq/%d-%s + +Purpose: + Handle threaded interrupts. + +To reduce its OS jitter, do the following: + +1. Use irq affinity to force the irq threads to execute on + some other CPU. + +Name: + kcmtpd_ctr_%d + +Purpose: + Handle Bluetooth work. + +To reduce its OS jitter, do one of the following: + +1. Don't use Bluetooth, in which case these kthreads won't be + created in the first place. +2. Use irq affinity to force Bluetooth-related interrupts to + occur on some other CPU and furthermore initiate all + Bluetooth activity on some other CPU. + +Name: + ksoftirqd/%u + +Purpose: + Execute softirq handlers when threaded or when under heavy load. + +To reduce its OS jitter, each softirq vector must be handled +separately as follows: + +TIMER_SOFTIRQ +------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by forcing + both kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force + the CPU offline, then bring it back online. This forces + recurring timers to migrate elsewhere. If you are concerned + with multiple CPUs, force them all offline before bringing the + first one back online. Once you have onlined the CPUs in question, + do not offline any other CPUs, because doing so could force the + timer back onto one of the CPUs in question. + +NET_TX_SOFTIRQ and NET_RX_SOFTIRQ +--------------------------------- + +Do all of the following: + +1. Force networking interrupts onto other CPUs. +2. Initiate any network I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +BLOCK_SOFTIRQ +------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +IRQ_POLL_SOFTIRQ +---------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O and block-I/O polling on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +TASKLET_SOFTIRQ +--------------- + +Do one or more of the following: + +1. Avoid use of drivers that use tasklets. (Such drivers will contain + calls to things like tasklet_schedule().) +2. Convert all drivers that you must use from tasklets to workqueues. +3. Force interrupts for drivers using tasklets onto other CPUs, + and also do I/O involving these drivers on other CPUs. + +SCHED_SOFTIRQ +------------- + +Do all of the following: + +1. Avoid sending scheduler IPIs to the CPU to be de-jittered, + for example, ensure that at most one runnable kthread is present + on that CPU. If a thread that expects to run on the de-jittered + CPU awakens, the scheduler will send an IPI that can result in + a subsequent SCHED_SOFTIRQ. +2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered + is marked as an adaptive-ticks CPU using the "nohz_full=" + boot parameter. This reduces the number of scheduler-clock + interrupts that the de-jittered CPU receives, minimizing its + chances of being selected to do the load balancing work that + runs in SCHED_SOFTIRQ context. +3. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by + forcing both kernel threads and interrupts to execute elsewhere. + This further reduces the number of scheduler-clock interrupts + received by the de-jittered CPU. + +HRTIMER_SOFTIRQ +--------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle. For example, avoid system calls and force both + kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the + CPU offline, then bring it back online. This forces recurring + timers to migrate elsewhere. If you are concerned with multiple + CPUs, force them all offline before bringing the first one + back online. Once you have onlined the CPUs in question, do not + offline any other CPUs, because doing so could force the timer + back onto one of the CPUs in question. + +RCU_SOFTIRQ +----------- + +Do at least one of the following: + +1. Offload callbacks and keep the CPU in either dyntick-idle or + adaptive-ticks state by doing all of the following: + + a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be + de-jittered is marked as an adaptive-ticks CPU using the + "nohz_full=" boot parameter. Bind the rcuo kthreads to + housekeeping CPUs, which can tolerate OS jitter. + b. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +2. Enable RCU to do its processing remotely via dyntick-idle by + doing all of the following: + + a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. + b. Ensure that the CPU goes idle frequently, allowing other + CPUs to detect that it has passed through an RCU quiescent + state. If the kernel is built with CONFIG_NO_HZ_FULL=y, + userspace execution also allows other CPUs to detect that + the CPU in question has passed through a quiescent state. + c. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +Name: + kworker/%u:%d%s (cpu, id, priority) + +Purpose: + Execute workqueue requests + +To reduce its OS jitter, do any of the following: + +1. Run your workload at a real-time priority, which will allow + preempting the kworker daemons. +2. A given workqueue can be made visible in the sysfs filesystem + by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). + Such a workqueue can be confined to a given subset of the + CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs + files. The set of WQ_SYSFS workqueues can be displayed using + "ls sys/devices/virtual/workqueue". That said, the workqueues + maintainer would like to caution people against indiscriminately + sprinkling WQ_SYSFS across all the workqueues. The reason for + caution is that it is easy to add WQ_SYSFS, but because sysfs is + part of the formal user/kernel API, it can be nearly impossible + to remove it, even if its addition was a mistake. +3. Do any of the following needed to avoid jitter that your + application cannot tolerate: + + a. Build your kernel with CONFIG_SLUB=y rather than + CONFIG_SLAB=y, thus avoiding the slab allocator's periodic + use of each CPU's workqueues to run its cache_reap() + function. + b. Avoid using oprofile, thus avoiding OS jitter from + wq_sync_buffer(). + c. Limit your CPU frequency so that a CPU-frequency + governor is not required, possibly enlisting the aid of + special heatsinks or other cooling technologies. If done + correctly, and if you CPU architecture permits, you should + be able to build your kernel with CONFIG_CPU_FREQ=n to + avoid the CPU-frequency governor periodically running + on each CPU, including cs_dbs_timer() and od_dbs_timer(). + + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + d. As of v3.18, Christoph Lameter's on-demand vmstat workers + commit prevents OS jitter due to vmstat_update() on + CONFIG_SMP=y systems. Before v3.18, is not possible + to entirely get rid of the OS jitter, but you can + decrease its frequency by writing a large value to + /proc/sys/vm/stat_interval. The default value is HZ, + for an interval of one second. Of course, larger values + will make your virtual-memory statistics update more + slowly. Of course, you can also run your workload at + a real-time priority, thus preempting vmstat_update(), + but if your workload is CPU-bound, this is a bad idea. + However, there is an RFC patch from Christoph Lameter + (based on an earlier one from Gilad Ben-Yossef) that + reduces or even eliminates vmstat overhead for some + workloads at https://lkml.org/lkml/2013/9/4/379. + e. Boot with "elevator=noop" to avoid workqueue use by + the block layer. + f. If running on high-end powerpc servers, build with + CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS + daemon from running on each CPU every second or so. + (This will require editing Kconfig files and will defeat + this platform's RAS functionality.) This avoids jitter + due to the rtas_event_scan() function. + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + g. If running on Cell Processor, build your kernel with + CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from + spu_gov_work(). + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + h. If running on PowerMAC, build your kernel with + CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, + avoiding OS jitter from rackmeter_do_timer(). + +Name: + rcuc/%u + +Purpose: + Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. + +To reduce its OS jitter, do at least one of the following: + +1. Build the kernel with CONFIG_PREEMPT=n. This prevents these + kthreads from being created in the first place, and also obviates + the need for RCU priority boosting. This approach is feasible + for workloads that do not require high degrees of responsiveness. +2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these + kthreads from being created in the first place. This approach + is feasible only if your workload never requires RCU priority + boosting, for example, if you ensure frequent idle time on all + CPUs that might execute within the kernel. +3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= + boot parameter offloading RCU callbacks from all CPUs susceptible + to OS jitter. This approach prevents the rcuc/%u kthreads from + having any work to do, so that they are never awakened. +4. Ensure that the CPU never enters the kernel, and, in particular, + avoid initiating any CPU hotplug operations on this CPU. This is + another way of preventing any callbacks from being queued on the + CPU, again preventing the rcuc/%u kthreads from having any work + to do. + +Name: + rcuop/%d and rcuos/%d + +Purpose: + Offload RCU callbacks from the corresponding CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Use affinity, cgroups, or other mechanism to force these kthreads + to execute on some other CPU. +2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these + kthreads from being created in the first place. However, please + note that this will not eliminate OS jitter, but will instead + shift it to RCU_SOFTIRQ. + +Name: + watchdog/%u + +Purpose: + Detect software lockups on each CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these + kthreads from being created in the first place. +2. Boot with "nosoftlockup=0", which will also prevent these kthreads + from being created. Other related watchdog and softlockup boot + parameters may be found in Documentation/admin-guide/kernel-parameters.rst + and Documentation/watchdog/watchdog-parameters.rst. +3. Echo a zero to /proc/sys/kernel/watchdog to disable the + watchdog timer. +4. Echo a large number of /proc/sys/kernel/watchdog_thresh in + order to reduce the frequency of OS jitter due to the watchdog + timer down to a level that is acceptable for your workload. diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst new file mode 100644 index 000000000000..a3eb00c62f53 --- /dev/null +++ b/Documentation/admin-guide/lcd-panel-cgram.rst @@ -0,0 +1,27 @@ +====================================== +Parallel port LCD/Keypad Panel support +====================================== + +Some LCDs allow you to define up to 8 characters, mapped to ASCII +characters 0 to 7. The escape code to define a new character is +'\e[LG' followed by one digit from 0 to 7, representing the character +number, and up to 8 couples of hex digits terminated by a semi-colon +(';'). Each couple of digits represents a line, with 1-bits for each +illuminated pixel with LSB on the right. Lines are numbered from the +top of the character to the bottom. On a 5x7 matrix, only the 5 lower +bits of the 7 first bytes are used for each character. If the string +is incomplete, only complete lines will be redefined. Here are some +examples:: + + printf "\e[LG0010101050D1F0C04;" => 0 = [enter] + printf "\e[LG1040E1F0000000000;" => 1 = [up] + printf "\e[LG2000000001F0E0400;" => 2 = [down] + printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] + printf "\e[LG40002060E1E0E0602;" => 4 = [left] + printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] + printf "\e[LG60016051516141400;" => 6 = "IP" + + printf "\e[LG00103071F1F070301;" => big speaker + printf "\e[LG00002061E1E060200;" => small speaker + +Willy diff --git a/Documentation/admin-guide/ldm.rst b/Documentation/admin-guide/ldm.rst new file mode 100644 index 000000000000..12c571368e73 --- /dev/null +++ b/Documentation/admin-guide/ldm.rst @@ -0,0 +1,121 @@ +========================================== +LDM - Logical Disk Manager (Dynamic Disks) +========================================== + +:Author: Originally Written by FlatCap - Richard Russon . +:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. + +Overview +-------- + +Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete +replacement for the MSDOS style partitions. It stores its information in a +1MiB journalled database at the end of the physical disk. The size of +partitions is limited only by disk space. The maximum number of partitions is +nearly 2000. + +Any partitions created under the LDM are called "Dynamic Disks". There are no +longer any primary or extended partitions. Normal MSDOS style partitions are +now known as Basic Disks. + +If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use +Dynamic Disks. The journalling allows Windows to make changes to these +partitions and filesystems without the need to reboot. + +Once the LDM driver has divided up the disk, you can use the MD driver to +assemble any multi-partition volumes, e.g. Stripes, RAID5. + +To prevent legacy applications from repartitioning the disk, the LDM creates a +dummy MSDOS partition containing one disk-sized partition. This is what is +supported with the Linux LDM driver. + +A newer approach that has been implemented with Vista is to put LDM on top of a +GPT label disk. This is not supported by the Linux LDM driver yet. + + +Example +------- + +Below we have a 50MiB disk, divided into seven partitions. + +.. note:: + + The missing 1MiB at the end of the disk is where the LDM database is + stored. + ++-------++--------------+---------+-----++--------------+---------+----+ +|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| ++=======++==============+=========+=====++==============+=========+====+ +|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| ++-------++--------------+---------+-----++--------------+---------+----+ + +The LDM Database may not store the partitions in the order that they appear on +disk, but the driver will sort them. + +When Linux boots, you will see something like:: + + hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 + hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 + + +Compiling LDM Support +--------------------- + +To enable LDM, choose the following two options: + + - "Advanced partition selection" CONFIG_PARTITION_ADVANCED + - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION + +If you believe the driver isn't working as it should, you can enable the extra +debugging code. This will produce a LOT of output. The option is: + + - "Windows LDM extra logging" CONFIG_LDM_DEBUG + +N.B. The partition code cannot be compiled as a module. + +As with all the partition code, if the driver doesn't see signs of its type of +partition, it will pass control to another driver, so there is no harm in +enabling it. + +If you have Dynamic Disks but don't enable the driver, then all you will see +is a dummy MSDOS partition filling the whole disk. You won't be able to mount +any of the volumes on the disk. + + +Booting +------- + +If you enable LDM support, then lilo is capable of booting from any of the +discovered partitions. However, grub does not understand the LDM partitioning +and cannot boot from a Dynamic Disk. + + +More Documentation +------------------ + +There is an Overview of the LDM together with complete Technical Documentation. +It is available for download. + + http://www.linux-ntfs.org/ + +If you have any LDM questions that aren't answered in the documentation, email +me. + +Cheers, + FlatCap - Richard Russon + ldm@flatcap.org + diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst new file mode 100644 index 000000000000..290840c160af --- /dev/null +++ b/Documentation/admin-guide/lockup-watchdogs.rst @@ -0,0 +1,83 @@ +=============================================================== +Softlockup detector and hardlockup detector (aka nmi_watchdog) +=============================================================== + +The Linux kernel can act as a watchdog to detect both soft and hard +lockups. + +A 'softlockup' is defined as a bug that causes the kernel to loop in +kernel mode for more than 20 seconds (see "Implementation" below for +details), without giving other tasks a chance to run. The current +stack trace is displayed upon detection and, by default, the system +will stay locked up. Alternatively, the kernel can be configured to +panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, +"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for +details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are +provided for this. + +A 'hardlockup' is defined as a bug that causes the CPU to loop in +kernel mode for more than 10 seconds (see "Implementation" below for +details), without letting other interrupts have a chance to run. +Similarly to the softlockup case, the current stack trace is displayed +upon detection and the system will stay locked up unless the default +behavior is changed, which can be done through a sysctl, +'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", +and a kernel parameter, "nmi_watchdog" +(see "Documentation/admin-guide/kernel-parameters.rst" for details). + +The panic option can be used in combination with panic_timeout (this +timeout is set through the confusingly named "kernel.panic" sysctl), +to cause the system to reboot automatically after a specified amount +of time. + +Implementation +============== + +The soft and hard lockup detectors are built on top of the hrtimer and +perf subsystems, respectively. A direct consequence of this is that, +in principle, they should work in any architecture where these +subsystems are present. + +A periodic hrtimer runs to generate interrupts and kick the watchdog +task. An NMI perf event is generated every "watchdog_thresh" +(compile-time initialized to 10 and configurable through sysctl of the +same name) seconds to check for hardlockups. If any CPU in the system +does not receive any hrtimer interrupt during that time the +'hardlockup detector' (the handler for the NMI perf event) will +generate a kernel warning or call panic, depending on the +configuration. + +The watchdog task is a high priority kernel thread that updates a +timestamp every time it is scheduled. If that timestamp is not updated +for 2*watchdog_thresh seconds (the softlockup threshold) the +'softlockup detector' (coded inside the hrtimer callback function) +will dump useful debug information to the system log, after which it +will call panic if it was instructed to do so or resume execution of +other kernel code. + +The period of the hrtimer is 2*watchdog_thresh/5, which means it has +two or three chances to generate an interrupt before the hardlockup +detector kicks in. + +As explained above, a kernel knob is provided that allows +administrators to configure the period of the hrtimer and the perf +event. The right value for a particular environment is a trade-off +between fast response to lockups and detection overhead. + +By default, the watchdog runs on all online cores. However, on a +kernel configured with NO_HZ_FULL, by default the watchdog runs only +on the housekeeping cores, not the cores specified in the "nohz_full" +boot argument. If we allowed the watchdog to run by default on +the "nohz_full" cores, we would have to run timer ticks to activate +the scheduler, which would prevent the "nohz_full" functionality +from protecting the user code on those cores from the kernel. +Of course, disabling it by default on the nohz_full cores means that +when those cores do enter the kernel, by default we will not be +able to detect if they lock up. However, allowing the watchdog +to continue to run on the housekeeping (non-tickless) cores means +that we will continue to detect lockups properly on those cores. + +In either case, the set of cores excluded from running the watchdog +may be adjusted via the kernel.watchdog_cpumask sysctl. For +nohz_full cores, this may be useful for debugging a case where the +kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst new file mode 100644 index 000000000000..4e06ffabd78a --- /dev/null +++ b/Documentation/admin-guide/mm/cma_debugfs.rst @@ -0,0 +1,25 @@ +===================== +CMA Debugfs Interface +===================== + +The CMA debugfs interface is useful to retrieve basic information out of the +different CMA areas and to test allocation/release in each of the areas. + +Each CMA zone represents a directory under /cma/, indexed by the +kernel's CMA index. So the first CMA zone would be: + + /cma/cma-0 + +The structure of the files created under that directory is as follows: + + - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. + - [RO] count: Amount of memory in the CMA area. + - [RO] order_per_bit: Order of pages represented by one bit. + - [RO] bitmap: The bitmap of page states in the zone. + - [WO] alloc: Allocate N pages from that CMA area. For example:: + + echo 5 > /cma/cma-2/alloc + +would try to allocate 5 pages from the cma-2 area. + + - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 5f61a6c429e0..11db46448354 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -26,6 +26,7 @@ the Linux memory management. :maxdepth: 1 concepts + cma_debugfs hugetlbpage idle_page_tracking ksm diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst new file mode 100644 index 000000000000..aaf1667489f8 --- /dev/null +++ b/Documentation/admin-guide/numastat.rst @@ -0,0 +1,30 @@ +=============================== +Numa policy hit/miss statistics +=============================== + +/sys/devices/system/node/node*/numastat + +All units are pages. Hugepages have separate counters. + +=============== ============================================================ +numa_hit A process wanted to allocate memory from this node, + and succeeded. + +numa_miss A process wanted to allocate memory from another node, + but ended up with memory from this node. + +numa_foreign A process wanted to allocate on this node, + but ended up with memory from another one. + +local_node A process ran on this node and got memory from it. + +other_node A process ran on this node and got memory from another node. + +interleave_hit Interleaving wanted to allocate from this node + and succeeded. +=============== ============================================================ + +For easier reading you can use the numastat utility from the numactl package +(http://oss.sgi.com/projects/libnuma/). Note that it only works +well right now on machines with a small number of CPUs. + diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst new file mode 100644 index 000000000000..bab2d10631f0 --- /dev/null +++ b/Documentation/admin-guide/pnp.rst @@ -0,0 +1,292 @@ +================================= +Linux Plug and Play Documentation +================================= + +:Author: Adam Belay +:Last updated: Oct. 16, 2002 + + +Overview +-------- + +Plug and Play provides a means of detecting and setting resources for legacy or +otherwise unconfigurable devices. The Linux Plug and Play Layer provides these +services to compatible drivers. + + +The User Interface +------------------ + +The Linux Plug and Play user interface provides a means to activate PnP devices +for legacy and user level drivers that do not support Linux Plug and Play. The +user interface is integrated into sysfs. + +In addition to the standard sysfs file the following are created in each +device's directory: +- id - displays a list of support EISA IDs +- options - displays possible resource configurations +- resources - displays currently allocated resources and allows resource changes + +activating a device +^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "auto" > resources + +this will invoke the automatic resource config system to activate the device + +manually activating a device +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "manual " > resources + + - the configuration number + - static or dynamic + static = for next boot + dynamic = now + +disabling a device +^^^^^^^^^^^^^^^^^^ + +:: + + # echo "disable" > resources + + +EXAMPLE: + +Suppose you need to activate the floppy disk controller. + +1. change to the proper directory, in my case it is + /driver/bus/pnp/devices/00:0f:: + + # cd /driver/bus/pnp/devices/00:0f + # cat name + PC standard floppy disk controller + +2. check if the device is already active:: + + # cat resources + DISABLED + + - Notice the string "DISABLED". This means the device is not active. + +3. check the device's possible configurations (optional):: + + # cat options + Dependent: 01 - Priority acceptable + port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding + port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + Dependent: 02 - Priority acceptable + port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding + port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + +4. now activate the device:: + + # echo "auto" > resources + +5. finally check if the device is active:: + + # cat resources + io 0x3f0-0x3f5 + io 0x3f7-0x3f7 + irq 6 + dma 2 + +also there are a series of kernel parameters:: + + pnp_reserve_irq=irq1[,irq2] .... + pnp_reserve_dma=dma1[,dma2] .... + pnp_reserve_io=io1,size1[,io2,size2] .... + pnp_reserve_mem=mem1,size1[,mem2,size2] .... + + + +The Unified Plug and Play Layer +------------------------------- + +All Plug and Play drivers, protocols, and services meet at a central location +called the Plug and Play Layer. This layer is responsible for the exchange of +information between PnP drivers and PnP protocols. Thus it automatically +forwards commands to the proper protocol. This makes writing PnP drivers +significantly easier. + +The following functions are available from the Plug and Play Layer: + +pnp_get_protocol + increments the number of uses by one + +pnp_put_protocol + deincrements the number of uses by one + +pnp_register_protocol + use this to register a new PnP protocol + +pnp_unregister_protocol + use this function to remove a PnP protocol from the Plug and Play Layer + +pnp_register_driver + adds a PnP driver to the Plug and Play Layer + + this includes driver model integration + returns zero for success or a negative error number for failure; count + calls to the .add() method if you need to know how many devices bind to + the driver + +pnp_unregister_driver + removes a PnP driver from the Plug and Play Layer + + + +Plug and Play Protocols +----------------------- + +This section contains information for PnP protocol developers. + +The following Protocols are currently available in the computing world: + +- PNPBIOS: + used for system devices such as serial and parallel ports. +- ISAPNP: + provides PnP support for the ISA bus +- ACPI: + among its many uses, ACPI provides information about system level + devices. + +It is meant to replace the PNPBIOS. It is not currently supported by Linux +Plug and Play but it is planned to be in the near future. + + +Requirements for a Linux PnP protocol: +1. the protocol must use EISA IDs +2. the protocol must inform the PnP Layer of a device's current configuration + +- the ability to set resources is optional but preferred. + +The following are PnP protocol related functions: + +pnp_add_device + use this function to add a PnP device to the PnP layer + + only call this function when all wanted values are set in the pnp_dev + structure + +pnp_init_device + call this to initialize the PnP structure + +pnp_remove_device + call this to remove a device from the Plug and Play Layer. + it will fail if the device is still in use. + automatically will free mem used by the device and related structures + +pnp_add_id + adds an EISA ID to the list of supported IDs for the specified device + +For more information consult the source of a protocol such as +/drivers/pnp/pnpbios/core.c. + + + +Linux Plug and Play Drivers +--------------------------- + +This section contains information for Linux PnP driver developers. + +The New Way +^^^^^^^^^^^ + +1. first make a list of supported EISA IDS + + ex:: + + static const struct pnp_id pnp_dev_table[] = { + /* Standard LPT Printer Port */ + {.id = "PNP0400", .driver_data = 0}, + /* ECP Printer Port */ + {.id = "PNP0401", .driver_data = 0}, + {.id = ""} + }; + + Please note that the character 'X' can be used as a wild card in the function + portion (last four characters). + + ex:: + + /* Unknown PnP modems */ + { "PNPCXXX", UNKNOWN_DEV }, + + Supported PnP card IDs can optionally be defined. + ex:: + + static const struct pnp_id pnp_card_table[] = { + { "ANYDEVS", 0 }, + { "", 0 } + }; + +2. Optionally define probe and remove functions. It may make sense not to + define these functions if the driver already has a reliable method of detecting + the resources, such as the parport_pc driver. + + ex:: + + static int + serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const + struct pnp_id *dev_id) + { + . . . + + ex:: + + static void serial_pnp_remove(struct pnp_dev * dev) + { + . . . + + consult /drivers/serial/8250_pnp.c for more information. + +3. create a driver structure + + ex:: + + static struct pnp_driver serial_pnp_driver = { + .name = "serial", + .card_id_table = pnp_card_table, + .id_table = pnp_dev_table, + .probe = serial_pnp_probe, + .remove = serial_pnp_remove, + }; + + * name and id_table cannot be NULL. + +4. register the driver + + ex:: + + static int __init serial8250_pnp_init(void) + { + return pnp_register_driver(&serial_pnp_driver); + } + +The Old Way +^^^^^^^^^^^ + +A series of compatibility functions have been created to make it easy to convert +ISAPNP drivers. They should serve as a temporary solution only. + +They are as follows:: + + struct pnp_card *pnp_find_card(unsigned short vendor, + unsigned short device, + struct pnp_card *from) + + struct pnp_dev *pnp_find_dev(struct pnp_card *card, + unsigned short vendor, + unsigned short function, + struct pnp_dev *from) + diff --git a/Documentation/admin-guide/rtc.rst b/Documentation/admin-guide/rtc.rst new file mode 100644 index 000000000000..688c95b11919 --- /dev/null +++ b/Documentation/admin-guide/rtc.rst @@ -0,0 +1,140 @@ +======================================= +Real Time Clock (RTC) Drivers for Linux +======================================= + +When Linux developers talk about a "Real Time Clock", they usually mean +something that tracks wall clock time and is battery backed so that it +works even with system power off. Such clocks will normally not track +the local time zone or daylight savings time -- unless they dual boot +with MS-Windows -- but will instead be set to Coordinated Universal Time +(UTC, formerly "Greenwich Mean Time"). + +The newest non-PC hardware tends to just count seconds, like the time(2) +system call reports, but RTCs also very commonly represent time using +the Gregorian calendar and 24 hour time, as reported by gmtime(3). + +Linux has two largely-compatible userspace RTC API families you may +need to know about: + + * /dev/rtc ... is the RTC provided by PC compatible systems, + so it's not very portable to non-x86 systems. + + * /dev/rtc0, /dev/rtc1 ... are part of a framework that's + supported by a wide variety of RTC chips on all systems. + +Programmers need to understand that the PC/AT functionality is not +always available, and some systems can do much more. That is, the +RTCs use the same API to make requests in both RTC frameworks (using +different filenames of course), but the hardware may not offer the +same functionality. For example, not every RTC is hooked up to an +IRQ, so they can't all issue alarms; and where standard PC RTCs can +only issue an alarm up to 24 hours in the future, other hardware may +be able to schedule one any time in the upcoming century. + + +Old PC/AT-Compatible driver: /dev/rtc +-------------------------------------- + +All PCs (even Alpha machines) have a Real Time Clock built into them. +Usually they are built into the chipset of the computer, but some may +actually have a Motorola MC146818 (or clone) on the board. This is the +clock that keeps the date and time while your computer is turned off. + +ACPI has standardized that MC146818 functionality, and extended it in +a few ways (enabling longer alarm periods, and wake-from-hibernate). +That functionality is NOT exposed in the old driver. + +However it can also be used to generate signals from a slow 2Hz to a +relatively fast 8192Hz, in increments of powers of two. These signals +are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is +for...) It can also function as a 24hr alarm, raising IRQ 8 when the +alarm goes off. The alarm can also be programmed to only check any +subset of the three programmable values, meaning that it could be set to +ring on the 30th second of the 30th minute of every hour, for example. +The clock can also be set to generate an interrupt upon every clock +update, thus generating a 1Hz signal. + +The interrupts are reported via /dev/rtc (major 10, minor 135, read only +character device) in the form of an unsigned long. The low byte contains +the type of interrupt (update-done, alarm-rang, or periodic) that was +raised, and the remaining bytes contain the number of interrupts since +the last read. Status information is reported through the pseudo-file +/proc/driver/rtc if the /proc filesystem was enabled. The driver has +built in locking so that only one process is allowed to have the /dev/rtc +interface open at a time. + +A user process can monitor these interrupts by doing a read(2) or a +select(2) on /dev/rtc -- either will block/stop the user process until +the next interrupt is received. This is useful for things like +reasonably high frequency data acquisition where one doesn't want to +burn up 100% CPU by polling gettimeofday etc. etc. + +At high frequencies, or under high loads, the user process should check +the number of interrupts received since the last read to determine if +there has been any interrupt "pileup" so to speak. Just for reference, a +typical 486-33 running a tight read loop on /dev/rtc will start to suffer +occasional interrupt pileup (i.e. > 1 IRQ event since last read) for +frequencies above 1024Hz. So you really should check the high bytes +of the value you read, especially at frequencies above that of the +normal timer interrupt, which is 100Hz. + +Programming and/or enabling interrupt frequencies greater than 64Hz is +only allowed by root. This is perhaps a bit conservative, but we don't want +an evil user generating lots of IRQs on a slow 386sx-16, where it might have +a negative impact on performance. This 64Hz limit can be changed by writing +a different value to /proc/sys/dev/rtc/max-user-freq. Note that the +interrupt handler is only a few lines of code to minimize any possibility +of this effect. + +Also, if the kernel time is synchronized with an external source, the +kernel will write the time back to the CMOS clock every 11 minutes. In +the process of doing this, the kernel briefly turns off RTC periodic +interrupts, so be aware of this if you are doing serious work. If you +don't synchronize the kernel time with an external source (via ntp or +whatever) then the kernel will keep its hands off the RTC, allowing you +exclusive access to the device for your applications. + +The alarm and/or interrupt frequency are programmed into the RTC via +various ioctl(2) calls as listed in ./include/linux/rtc.h +Rather than write 50 pages describing the ioctl() and so on, it is +perhaps more useful to include a small test program that demonstrates +how to use them, and demonstrates the features of the driver. This is +probably a lot more useful to people interested in writing applications +that will be using this driver. See the code at the end of this document. + +(The original /dev/rtc driver was written by Paul Gortmaker.) + + +New portable "RTC Class" drivers: /dev/rtcN +-------------------------------------------- + +Because Linux supports many non-ACPI and non-PC platforms, some of which +have more than one RTC style clock, it needed a more portable solution +than expecting a single battery-backed MC146818 clone on every system. +Accordingly, a new "RTC Class" framework has been defined. It offers +three different userspace interfaces: + + * /dev/rtcN ... much the same as the older /dev/rtc interface + + * /sys/class/rtc/rtcN ... sysfs attributes support readonly + access to some RTC attributes. + + * /proc/driver/rtc ... the system clock RTC may expose itself + using a procfs interface. If there is no RTC for the system clock, + rtc0 is used by default. More information is (currently) shown + here than through sysfs. + +The RTC Class framework supports a wide variety of RTCs, ranging from those +integrated into embeddable system-on-chip (SOC) processors to discrete chips +using I2C, SPI, or some other bus to communicate with the host CPU. There's +even support for PC-style RTCs ... including the features exposed on newer PCs +through ACPI. + +The new framework also removes the "one RTC per system" restriction. For +example, maybe the low-power battery-backed RTC is a discrete I2C chip, but +a high functionality RTC is integrated into the SOC. That system might read +the system clock from the discrete RTC, but use the integrated one for all +other tasks, because of its greater functionality. + +Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the +ioctl interface. diff --git a/Documentation/admin-guide/svga.rst b/Documentation/admin-guide/svga.rst new file mode 100644 index 000000000000..b6c2f9acca92 --- /dev/null +++ b/Documentation/admin-guide/svga.rst @@ -0,0 +1,249 @@ +.. include:: + +================================= +Video Mode Selection Support 2.13 +================================= + +:Copyright: |copy| 1995--1999 Martin Mares, + +Intro +~~~~~ + +This small document describes the "Video Mode Selection" feature which +allows the use of various special video modes supported by the video BIOS. Due +to usage of the BIOS, the selection is limited to boot time (before the +kernel decompression starts) and works only on 80X86 machines. + +.. note:: + + Short intro for the impatient: Just use vga=ask for the first time, + enter ``scan`` on the video mode prompt, pick the mode you want to use, + remember its mode ID (the four-digit hexadecimal number) and then + set the vga parameter to this number (converted to decimal first). + +The video mode to be used is selected by a kernel parameter which can be +specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." +option of LILO (or some other boot loader you use) or by the "vidmode" utility +(present in standard Linux utility packages). You can use the following values +of this parameter:: + + NORMAL_VGA - Standard 80x25 mode available on all display adapters. + + EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. + + ASK_VGA - Display a video mode menu upon startup (see below). + + 0..35 - Menu item number (when you have used the menu to view the list of + modes available on your adapter, you can specify the menu item you want + to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the + mode list displayed may vary as the kernel version changes, because the + modes are listed in a "first detected -- first displayed" manner. It's + better to use absolute mode numbers instead. + + 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below + for exact meaning of the ID). Warning: rdev and LILO don't support + hexadecimal numbers -- you have to convert it to decimal manually. + +Menu +~~~~ + +The ASK_VGA mode causes the kernel to offer a video mode menu upon +bootup. It displays a "Press to see video modes available, +to continue or wait 30 secs" message. If you press , you enter the +menu, if you press or wait 30 seconds, the kernel will boot up in +the standard 80x25 mode. + +The menu looks like:: + + Video adapter: + Mode: COLSxROWS: + 0 0F00 80x25 + 1 0F01 80x50 + 2 0F02 80x43 + 3 0F03 80x26 + .... + Enter mode number or ``scan``: + + tells what video adapter did Linux detect +-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA +with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection +of chipsets is turned off by default as it's inherently unreliable due to +absolutely insane PC design. + +"0 0F00 80x25" means that the first menu item (the menu items are numbered +from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the +next section for a description of mode IDs). + + encourages you to enter the item number or mode ID +you wish to set and press . If the computer complains something about +"Unknown mode ID", it is trying to tell you that it isn't possible to set such +a mode. It's also possible to press only which leaves the current mode. + +The mode list usually contains a few basic modes and some VESA modes. In +case your chipset has been detected, some chipset-specific modes are shown as +well (some of these might be missing or unusable on your machine as different +BIOSes are often shipped with the same card and the mode numbers depend purely +on the VGA BIOS). + +The modes displayed on the menu are partially sorted: The list starts with +the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and +80x43), local modes (if the local modes feature is enabled), VESA modes and +finally SVGA modes for the auto-detected adapter. + +If you are not happy with the mode list offered (e.g., if you think your card +is able to do more), you can enter "scan" instead of item number / mode ID. The +program will try to ask the BIOS for all possible video mode numbers and test +what happens then. The screen will be probably flashing wildly for some time and +strange noises will be heard from inside the monitor and so on and then, really +all consistent video modes supported by your BIOS will appear (plus maybe some +``ghost modes``). If you are afraid this could damage your monitor, don't use +this function. + +After scanning, the mode ordering is a bit different: the auto-detected SVGA +modes are not listed at all and the modes revealed by ``scan`` are shown before +all VESA modes. + +Mode IDs +~~~~~~~~ + +Because of the complexity of all the video stuff, the video mode IDs +used here are also a bit complex. A video mode ID is a 16-bit number usually +expressed in a hexadecimal notation (starting with "0x"). You can set a mode +by entering its mode directly if you know it even if it isn't shown on the menu. + +The ID numbers can be divided to those regions:: + + 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use + outside the menu as this can change from boot to boot (especially if you + have used the ``scan`` feature). + + 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number + (as presented to INT 10, function 00) increased by 0x0100. + + 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by + 0x0100. All VESA modes should be autodetected and shown on the menu. + + 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. + (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, + 945=132x28 for the standard Video7 BIOS) + + 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually + by modifying one of the standard modes). Currently available: + 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) + 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA + 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) + 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) + 0x0f04 leave current video mode + 0x0f05 VGA 80x30 (480 scans, 16-point font) + 0x0f06 VGA 80x34 (480 scans, 14-point font) + 0x0f07 VGA 80x60 (480 scans, 8-point font) + 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) + + 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" + form where RR is a number of rows and CC is a number of columns. + E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. + This is the only fully portable way to refer to a non-standard mode, + but it relies on the mode being found and displayed on the menu + (remember that mode scanning is not done automatically). + + 0xff00 to 0xffff - aliases for backward compatibility: + 0xffff equivalent to 0x0f00 (standard 80x25) + 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) + +If you add 0x8000 to the mode ID, the program will try to recalculate +vertical display timing according to mode parameters, which can be used to +eliminate some annoying bugs of certain VGA BIOSes (usually those used for +cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the +end of the display. + +Options +~~~~~~~ + +Build options for arch/x86/boot/* are selected by the kernel kconfig +utility and the kernel .config file. + +VIDEO_GFX_HACK - includes special hack for setting of graphics modes +to be used later by special drivers. +Allows to set _any_ BIOS mode including graphic ones and forcing specific +text screen resolution instead of peeking it from BIOS variables. Don't use +unless you think you know what you're doing. To activate this setup, use +mode number 0x0f08 (see the Mode IDs section above). + +Still doesn't work? +~~~~~~~~~~~~~~~~~~~ + +When the mode detection doesn't work (e.g., the mode list is incorrect or +the machine hangs instead of displaying the menu), try to switch off some of +the configuration options listed under "Options". If it fails, you can still use +your kernel with the video mode set directly via the kernel parameter. + +In either case, please send me a bug report containing what _exactly_ +happens and how do the configuration switches affect the behaviour of the bug. + +If you start Linux from M$-DOS, you might also use some DOS tools for +video mode setting. In this case, you must specify the 0x0f04 mode ("leave +current settings") to Linux, because if you don't and you use any non-standard +mode, Linux will switch to 80x25 automatically. + +If you set some extended mode and there's one or more extra lines on the +bottom of the display containing already scrolled-out text, your VGA BIOS +contains the most common video BIOS bug called "incorrect vertical display +end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, +this must be done manually -- no autodetection mechanisms are available. + +History +~~~~~~~ + +=============== ================================================================ +1.0 (??-Nov-95) First version supporting all adapters supported by the old + setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels + and then removed due to instability on some machines. +2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost + everything is configurable, the VESA support should be much more + stable, explicit mode numbering allowed, "scan" implemented etc. +2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution + supported. Few bugs fixed. VESA modes are listed prior to + modes supplied by SVGA autodetection as they are more reliable. + CLGD autodetect works better. Doesn't depend on 80x25 being + active when started. Scanning fixed. 80x43 (any VGA) added. + Code cleaned up. +2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX + VESA modes work now). Display end bug workaround supported. + Special modes renumbered to allow adding of the "recalculate" + flag, 0xffff and 0xfffe became aliases instead of real IDs. + Screen contents retained during mode changes. +2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. +2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem + with some boot loaders. Memory management rewritten to reflect + these changes. Unfortunately, screen contents retaining works + only with some loaders now. + Added a Tseng 132x60 mode. +2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. +2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on + several cards with broken VESA code (e.g., ATI VGA). +2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some + cards use very strange mode numbers. + - Added Realtek VGA modes (thanks to Gonzalo Tornaria). + - Hardware testing order slightly changed, tests based on ROM + contents done as first. + - Added support for special Video7 mode switching functions + (thanks to Tom Vander Aa). + - Added 480-scanline modes (especially useful for notebooks, + original version written by hhanemaa@cs.ruu.nl, patched by + Jeff Chua, rewritten by me). + - Screen store/restore fixed. +2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. + - Better recognition of text modes during mode scan. +2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) +2.10(11-Nov-96) - The whole thing made optional. + - Added the CONFIG_VIDEO_400_HACK switch. + - Added the CONFIG_VIDEO_GFX_HACK switch. + - Code cleanup. +2.11(03-May-97) - Yet another cleanup, now including also the documentation. + - Direct testing of SVGA adapters turned off by default, ``scan`` + offered explicitly on the prompt line. + - Removed the doc section describing adding of new probing + functions as I try to get rid of _all_ hardware probing here. +2.12(25-May-98) Added support for VESA frame buffer graphics. +2.13(14-May-99) Minor documentation fixes. +=============== ================================================================ diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a0c1d4ce403a..032c7cd3cede 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -327,7 +327,7 @@ when a hard lockup is detected. 0 - don't panic on hard lockup 1 - panic on hard lockup -See Documentation/lockup-watchdogs.txt for more information. This can +See Documentation/admin-guide/lockup-watchdogs.rst for more information. This can also be set using the nmi_watchdog kernel parameter. diff --git a/Documentation/admin-guide/video-output.rst b/Documentation/admin-guide/video-output.rst new file mode 100644 index 000000000000..56d6fa2e2368 --- /dev/null +++ b/Documentation/admin-guide/video-output.rst @@ -0,0 +1,34 @@ +Video Output Switcher Control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +2006 luming.yu@intel.com + +The output sysfs class driver provides an abstract video output layer that +can be used to hook platform specific methods to enable/disable video output +device through common sysfs interface. For example, on my IBM ThinkPad T42 +laptop, The ACPI video driver registered its output devices and read/write +method for 'state' with output sysfs class. The user interface under sysfs is:: + + linux:/sys/class/video_output # tree . + . + |-- CRT0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- DVI0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- LCD0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + `-- TV0 + |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + |-- state + |-- subsystem -> ../../../class/video_output + `-- uevent + diff --git a/Documentation/auxdisplay/lcd-panel-cgram.rst b/Documentation/auxdisplay/lcd-panel-cgram.rst deleted file mode 100644 index dfef50286018..000000000000 --- a/Documentation/auxdisplay/lcd-panel-cgram.rst +++ /dev/null @@ -1,29 +0,0 @@ -:orphan: - -====================================== -Parallel port LCD/Keypad Panel support -====================================== - -Some LCDs allow you to define up to 8 characters, mapped to ASCII -characters 0 to 7. The escape code to define a new character is -'\e[LG' followed by one digit from 0 to 7, representing the character -number, and up to 8 couples of hex digits terminated by a semi-colon -(';'). Each couple of digits represents a line, with 1-bits for each -illuminated pixel with LSB on the right. Lines are numbered from the -top of the character to the bottom. On a 5x7 matrix, only the 5 lower -bits of the 7 first bytes are used for each character. If the string -is incomplete, only complete lines will be redefined. Here are some -examples:: - - printf "\e[LG0010101050D1F0C04;" => 0 = [enter] - printf "\e[LG1040E1F0000000000;" => 1 = [up] - printf "\e[LG2000000001F0E0400;" => 2 = [down] - printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] - printf "\e[LG40002060E1E0E0602;" => 4 = [left] - printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] - printf "\e[LG60016051516141400;" => 6 = "IP" - - printf "\e[LG00103071F1F070301;" => big speaker - printf "\e[LG00002061E1E060200;" => small speaker - -Willy diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt deleted file mode 100644 index ec57740ead0c..000000000000 --- a/Documentation/btmrvl.txt +++ /dev/null @@ -1,124 +0,0 @@ -============= -btmrvl driver -============= - -All commands are used via debugfs interface. - -Set/get driver configurations -============================= - -Path: /debug/btmrvl/config/ - -gpiogap=[n], hscfgcmd - These commands are used to configure the host sleep parameters:: - bit 8:0 -- Gap - bit 16:8 -- GPIO - - where GPIO is the pin number of GPIO used to wake up the host. - It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface - wakeup will be used instead). - - where Gap is the gap in milli seconds between wakeup signal and - wakeup event, or 0xff for special host sleep setting. - - Usage:: - - # Use SDIO interface to wake up the host and set GAP to 0x80: - echo 0xff80 > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - - # Use GPIO pin #3 to wake up the host and set GAP to 0xff: - echo 0x03ff > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - -psmode=[n], pscmd - These commands are used to enable/disable auto sleep mode - - where the option is:: - - 1 -- Enable auto sleep mode - 0 -- Disable auto sleep mode - - Usage:: - - # Enable auto sleep mode - echo 1 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - # Disable auto sleep mode - echo 0 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - -hsmode=[n], hscmd - These commands are used to enable host sleep or wake up firmware - - where the option is:: - - 1 -- Enable host sleep - 0 -- Wake up firmware - - Usage:: - - # Enable host sleep - echo 1 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - # Wake up firmware - echo 0 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - -Get driver status -================= - -Path: /debug/btmrvl/status/ - -Usage:: - - cat /debug/btmrvl/status/ - -where the args are: - -curpsmode - This command displays current auto sleep status. - -psstate - This command display the power save state. - -hsstate - This command display the host sleep state. - -txdnldrdy - This command displays the value of Tx download ready flag. - -Issuing a raw hci command -========================= - -Use hcitool to issue raw hci command, refer to hcitool manual - -Usage:: - - Hcitool cmd [Parameters] - -Interface Control Command:: - - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface - -SD8688 firmware -=============== - -Images: - -- /lib/firmware/sd8688_helper.bin -- /lib/firmware/sd8688.bin - - -The images can be downloaded from: - -git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt deleted file mode 100644 index 211fd926cf00..000000000000 --- a/Documentation/clearing-warn-once.txt +++ /dev/null @@ -1,9 +0,0 @@ -Clearing WARN_ONCE ------------------- - -WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. - -echo 1 > /sys/kernel/debug/clear_warn_once - -clears the state and allows the warnings to print once again. -This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/cma/debugfs.rst b/Documentation/cma/debugfs.rst deleted file mode 100644 index 518fe401b5ee..000000000000 --- a/Documentation/cma/debugfs.rst +++ /dev/null @@ -1,27 +0,0 @@ -:orphan: - -===================== -CMA Debugfs Interface -===================== - -The CMA debugfs interface is useful to retrieve basic information out of the -different CMA areas and to test allocation/release in each of the areas. - -Each CMA zone represents a directory under /cma/, indexed by the -kernel's CMA index. So the first CMA zone would be: - - /cma/cma-0 - -The structure of the files created under that directory is as follows: - - - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. - - [RO] count: Amount of memory in the CMA area. - - [RO] order_per_bit: Order of pages represented by one bit. - - [RO] bitmap: The bitmap of page states in the zone. - - [WO] alloc: Allocate N pages from that CMA area. For example:: - - echo 5 > /cma/cma-2/alloc - -would try to allocate 5 pages from the cma-2 area. - - - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt deleted file mode 100644 index 2d01ce43d2a2..000000000000 --- a/Documentation/cpu-load.txt +++ /dev/null @@ -1,114 +0,0 @@ -======== -CPU load -======== - -Linux exports various bits of information via ``/proc/stat`` and -``/proc/uptime`` that userland tools, such as top(1), use to calculate -the average time system spent in a particular state, for example:: - - $ iostat - Linux 2.6.18.3-exp (linmac) 02/20/2007 - - avg-cpu: %user %nice %system %iowait %steal %idle - 10.01 0.00 2.92 5.44 0.00 81.63 - - ... - -Here the system thinks that over the default sampling period the -system spent 10.01% of the time doing work in user space, 2.92% in the -kernel, and was overall 81.63% of the time idle. - -In most cases the ``/proc/stat`` information reflects the reality quite -closely, however due to the nature of how/when the kernel collects -this data sometimes it can not be trusted at all. - -So how is this information collected? Whenever timer interrupt is -signalled the kernel looks what kind of task was running at this -moment and increments the counter that corresponds to this tasks -kind/state. The problem with this is that the system could have -switched between various states multiple times between two timer -interrupts yet the counter is incremented only for the last state. - - -Example -------- - -If we imagine the system with one task that periodically burns cycles -in the following manner:: - - time line between two timer interrupts - |--------------------------------------| - ^ ^ - |_ something begins working | - |_ something goes to sleep - (only to be awaken quite soon) - -In the above situation the system will be 0% loaded according to the -``/proc/stat`` (since the timer interrupt will always happen when the -system is executing the idle handler), but in reality the load is -closer to 99%. - -One can imagine many more situations where this behavior of the kernel -will lead to quite erratic information inside ``/proc/stat``:: - - - /* gcc -o hog smallhog.c */ - #include - #include - #include - #include - #define HIST 10 - - static volatile sig_atomic_t stop; - - static void sighandler (int signr) - { - (void) signr; - stop = 1; - } - static unsigned long hog (unsigned long niters) - { - stop = 0; - while (!stop && --niters); - return niters; - } - int main (void) - { - int i; - struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, - .it_value = { .tv_sec = 0, .tv_usec = 1 } }; - sigset_t set; - unsigned long v[HIST]; - double tmp = 0.0; - unsigned long n; - signal (SIGALRM, &sighandler); - setitimer (ITIMER_REAL, &it, NULL); - - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) tmp += v[i]; - tmp /= HIST; - n = tmp - (tmp / 3.0); - - sigemptyset (&set); - sigaddset (&set, SIGALRM); - - for (;;) { - hog (n); - sigwait (&set, &i); - } - return 0; - } - - -References ----------- - -- http://lkml.org/lkml/2007/2/12/6 -- Documentation/filesystems/proc.txt (1.8) - - -Thanks ------- - -Con Kolivas, Pavel Machek diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt deleted file mode 100644 index b90dafcc8237..000000000000 --- a/Documentation/cputopology.txt +++ /dev/null @@ -1,177 +0,0 @@ -=========================================== -How CPU topology info is exported via sysfs -=========================================== - -Export CPU topology info via sysfs. Items (attributes) are similar -to /proc/cpuinfo output of some architectures. They reside in -/sys/devices/system/cpu/cpuX/topology/: - -physical_package_id: - - physical package id of cpuX. Typically corresponds to a physical - socket number, but the actual value is architecture and platform - dependent. - -die_id: - - the CPU die ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_id: - - the CPU core ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -book_id: - - the book ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -drawer_id: - - the drawer ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_cpus: - - internal kernel map of CPUs within the same core. - (deprecated name: "thread_siblings") - -core_cpus_list: - - human-readable list of CPUs within the same core. - (deprecated name: "thread_siblings_list"); - -package_cpus: - - internal kernel map of the CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings") - -package_cpus_list: - - human-readable list of CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings_list") - -die_cpus: - - internal kernel map of CPUs within the same die. - -die_cpus_list: - - human-readable list of CPUs within the same die. - -book_siblings: - - internal kernel map of cpuX's hardware threads within the same - book_id. - -book_siblings_list: - - human-readable list of cpuX's hardware threads within the same - book_id. - -drawer_siblings: - - internal kernel map of cpuX's hardware threads within the same - drawer_id. - -drawer_siblings_list: - - human-readable list of cpuX's hardware threads within the same - drawer_id. - -Architecture-neutral, drivers/base/topology.c, exports these attributes. -However, the book and drawer related sysfs files will only be created if -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. - -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, -where they reflect the cpu and cache hierarchy. - -For an architecture to support this feature, it must define some of -these macros in include/asm-XXX/topology.h:: - - #define topology_physical_package_id(cpu) - #define topology_die_id(cpu) - #define topology_core_id(cpu) - #define topology_book_id(cpu) - #define topology_drawer_id(cpu) - #define topology_sibling_cpumask(cpu) - #define topology_core_cpumask(cpu) - #define topology_die_cpumask(cpu) - #define topology_book_cpumask(cpu) - #define topology_drawer_cpumask(cpu) - -The type of ``**_id macros`` is int. -The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter -correspond with appropriate ``**_siblings`` sysfs attributes (except for -topology_sibling_cpumask() which corresponds with thread_siblings). - -To be consistent on all architectures, include/linux/topology.h -provides default definitions for any of the above macros that are -not defined by include/asm-XXX/topology.h: - -1) topology_physical_package_id: -1 -2) topology_die_id: -1 -3) topology_core_id: 0 -4) topology_sibling_cpumask: just the given CPU -5) topology_core_cpumask: just the given CPU -6) topology_die_cpumask: just the given CPU - -For architectures that don't support books (CONFIG_SCHED_BOOK) there are no -default definitions for topology_book_id() and topology_book_cpumask(). -For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are -no default definitions for topology_drawer_id() and topology_drawer_cpumask(). - -Additionally, CPU topology information is provided under -/sys/devices/system/cpu and includes these files. The internal -source for the output is in brackets ("[]"). - - =========== ========================================================== - kernel_max: the maximum CPU index allowed by the kernel configuration. - [NR_CPUS-1] - - offline: CPUs that are not online because they have been - HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit - of CPUs allowed by the kernel configuration (kernel_max - above). [~cpu_online_mask + cpus >= NR_CPUS] - - online: CPUs that are online and being scheduled [cpu_online_mask] - - possible: CPUs that have been allocated resources and can be - brought online if they are present. [cpu_possible_mask] - - present: CPUs that have been identified as being present in the - system. [cpu_present_mask] - =========== ========================================================== - -The format for the above output is compatible with cpulist_parse() -[see ]. Some examples follow. - -In this example, there are 64 CPUs in the system but cpus 32-63 exceed -the kernel max which is limited to 0..31 by the NR_CPUS config option -being 32. Note also that CPUs 2 and 4-31 are not online but could be -brought online as they are both present and possible:: - - kernel_max: 31 - offline: 2,4-31,32-63 - online: 0-1,3 - possible: 0-31 - present: 0-31 - -In this example, the NR_CPUS config option is 128, but the kernel was -started with possible_cpus=144. There are 4 CPUs in the system and cpu2 -was manually taken offline (and is the only CPU that can be brought -online.):: - - kernel_max: 127 - offline: 2,4-127,128-143 - online: 0-1,3 - possible: 0-127 - present: 0-3 - -See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter -as well as more information on the various cpumasks. diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt deleted file mode 100644 index 833edb0d0bc4..000000000000 --- a/Documentation/efi-stub.txt +++ /dev/null @@ -1,100 +0,0 @@ -================= -The EFI Boot Stub -================= - -On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade -as a PE/COFF image, thereby convincing EFI firmware loaders to load -it as an EFI executable. The code that modifies the bzImage header, -along with the EFI-specific entry point that the firmware loader -jumps to are collectively known as the "EFI boot stub", and live in -arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, -respectively. For ARM the EFI stub is implemented in -arch/arm/boot/compressed/efi-header.S and -arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared -between architectures is in drivers/firmware/efi/libstub. - -For arm64, there is no compressed kernel support, so the Image itself -masquerades as a PE/COFF image and the EFI stub is linked into the -kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S -and drivers/firmware/efi/libstub/arm64-stub.c. - -By using the EFI boot stub it's possible to boot a Linux kernel -without the use of a conventional EFI boot loader, such as grub or -elilo. Since the EFI boot stub performs the jobs of a boot loader, in -a certain sense it *IS* the boot loader. - -The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. - - -How to install bzImage.efi --------------------------- - -The bzImage located in arch/x86/boot/bzImage must be copied to the EFI -System Partition (ESP) and renamed with the extension ".efi". Without -the extension the EFI firmware loader will refuse to execute it. It's -not possible to execute bzImage.efi from the usual Linux file systems -because EFI firmware doesn't have support for them. For ARM the -arch/arm/boot/zImage should be copied to the system partition, and it -may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image -should be copied but not necessarily renamed. - - -Passing kernel parameters from the EFI shell --------------------------------------------- - -Arguments to the kernel can be passed after bzImage.efi, e.g.:: - - fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 - - -The "initrd=" option --------------------- - -Like most boot loaders, the EFI stub allows the user to specify -multiple initrd files using the "initrd=" option. This is the only EFI -stub-specific command line parameter, everything else is passed to the -kernel when it boots. - -The path to the initrd file must be an absolute path from the -beginning of the ESP, relative path names do not work. Also, the path -is an EFI-style path and directory elements must be separated with -backslashes (\). For example, given the following directory layout:: - - fs0:> - Kernels\ - bzImage.efi - initrd-large.img - - Ramdisks\ - initrd-small.img - initrd-medium.img - -to boot with the initrd-large.img file if the current working -directory is fs0:\Kernels, the following command must be used:: - - fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img - -Notice how bzImage.efi can be specified with a relative path. That's -because the image we're executing is interpreted by the EFI shell, -which understands relative paths, whereas the rest of the command line -is passed to bzImage.efi. - - -The "dtb=" option ------------------ - -For the ARM and arm64 architectures, a device tree must be provided to -the kernel. Normally firmware shall supply the device tree via the -EFI CONFIGURATION TABLE. However, the "dtb=" command line option can -be used to override the firmware supplied device tree, or to supply -one when firmware is unable to. - -Please note: Firmware adds runtime configuration information to the -device tree before booting the kernel. If dtb= is used to override -the device tree, then any runtime data provided by firmware will be -lost. The dtb= option should only be used either as a debug tool, or -as a last resort when a device tree is not provided in the EFI -CONFIGURATION TABLE. - -"dtb=" is processed in the same manner as the "initrd=" option that is -described above. diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst index 2ed0dfb661cf..6821c87b7893 100644 --- a/Documentation/fb/vesafb.rst +++ b/Documentation/fb/vesafb.rst @@ -30,7 +30,7 @@ How to use it? ============== Switching modes is done using the vga=... boot parameter. Read -Documentation/svga.txt for details. +Documentation/admin-guide/svga.rst for details. You should compile in both vgacon (for text mode) and vesafb (for graphics mode). Which of them takes over the console depends on diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt deleted file mode 100644 index 6ee70465c0ea..000000000000 --- a/Documentation/highuid.txt +++ /dev/null @@ -1,80 +0,0 @@ -=================================================== -Notes on the change from 16-bit UIDs to 32-bit UIDs -=================================================== - -:Author: Chris Wing -:Last updated: January 11, 2000 - -- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t - when communicating between user and kernel space in an ioctl or data - structure. - -- kernel code should use uid_t and gid_t in kernel-private structures and - code. - -What's left to be done for 32-bit UIDs on all Linux architectures: - -- Disk quotas have an interesting limitation that is not related to the - maximum UID/GID. They are limited by the maximum file size on the - underlying filesystem, because quota records are written at offsets - corresponding to the UID in question. - Further investigation is needed to see if the quota system can cope - properly with huge UIDs. If it can deal with 64-bit file offsets on all - architectures, this should not be a problem. - -- Decide whether or not to keep backwards compatibility with the system - accounting file, or if we should break it as the comments suggest - (currently, the old 16-bit UID and GID are still written to disk, and - part of the former pad space is used to store separate 32-bit UID and - GID) - -- Need to validate that OS emulation calls the 16-bit UID - compatibility syscalls, if the OS being emulated used 16-bit UIDs, or - uses the 32-bit UID system calls properly otherwise. - - This affects at least: - - - iBCS on Intel - - - sparc32 emulation on sparc64 - (need to support whatever new 32-bit UID system calls are added to - sparc32) - -- Validate that all filesystems behave properly. - - At present, 32-bit UIDs _should_ work for: - - - ext2 - - ufs - - isofs - - nfs - - coda - - udf - - Ioctl() fixups have been made for: - - - ncpfs - - smbfs - - Filesystems with simple fixups to prevent 16-bit UID wraparound: - - - minix - - sysv - - qnx4 - - Other filesystems have not been checked yet. - -- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in - all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but - more are needed. (as well as new user<->kernel data structures) - -- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, - sh, and sparc32. Fixing this is probably not that important, but would - require adding a new ELF section. - -- The ioctl()s used to control the in-kernel NFS server only support - 16-bit UIDs on arm, i386, m68k, sh, and sparc32. - -- make sure that the UID mapping feature of AX25 networking works properly - (it should be safe because it's always used a 32-bit integer to - communicate between user and kernel) diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt deleted file mode 100644 index 121de96e395e..000000000000 --- a/Documentation/hw_random.txt +++ /dev/null @@ -1,105 +0,0 @@ -========================================================== -Linux support for random number generator in i8xx chipsets -========================================================== - -Introduction -============ - -The hw_random framework is software that makes use of a -special hardware feature on your CPU or motherboard, -a Random Number Generator (RNG). The software has two parts: -a core providing the /dev/hwrng character device and its -sysfs support, plus a hardware-specific driver that plugs -into that core. - -To make the most effective use of these mechanisms, you -should download the support software as well. Download the -latest version of the "rng-tools" package from the -hw_random driver's official Web site: - - http://sourceforge.net/projects/gkernel/ - -Those tools use /dev/hwrng to fill the kernel entropy pool, -which is used internally and exported by the /dev/urandom and -/dev/random special files. - -Theory of operation -=================== - -CHARACTER DEVICE. Using the standard open() -and read() system calls, you can read random data from -the hardware RNG device. This data is NOT CHECKED by any -fitness tests, and could potentially be bogus (if the -hardware is faulty or has been tampered with). Data is only -output if the hardware "has-data" flag is set, but nevertheless -a security-conscious person would run fitness tests on the -data before assuming it is truly random. - -The rng-tools package uses such tests in "rngd", and lets you -run them by hand with a "rngtest" utility. - -/dev/hwrng is char device major 10, minor 183. - -CLASS DEVICE. There is a /sys/class/misc/hw_random node with -two unique attributes, "rng_available" and "rng_current". The -"rng_available" attribute lists the hardware-specific drivers -available, while "rng_current" lists the one which is currently -connected to /dev/hwrng. If your system has more than one -RNG available, you may change the one used by writing a name from -the list in "rng_available" into "rng_current". - -========================================================================== - - -Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) - - Copyright 2000,2001 Jeff Garzik - - Copyright 2000,2001 Philipp Rumpf - - -About the Intel RNG hardware, from the firmware hub datasheet -============================================================= - -The Firmware Hub integrates a Random Number Generator (RNG) -using thermal noise generated from inherently random quantum -mechanical properties of silicon. When not generating new random -bits the RNG circuitry will enter a low power state. Intel will -provide a binary software driver to give third party software -access to our RNG for use as a security feature. At this time, -the RNG is only to be used with a system in an OS-present state. - -Intel RNG Driver notes -====================== - -FIXME: support poll(2) - -.. note:: - - request_mem_region was removed, for three reasons: - - 1) Only one RNG is supported by this driver; - 2) The location used by the RNG is a fixed location in - MMIO-addressable memory; - 3) users with properly working BIOS e820 handling will always - have the region in which the RNG is located reserved, so - request_mem_region calls always fail for proper setups. - However, for people who use mem=XX, BIOS e820 information is - **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can - succeed. - -Driver details -============== - -Based on: - Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet - May 1999 Order Number: 290658-002 R - -Intel 82802 Firmware Hub: - Random Number Generator - Programmer's Reference Manual - December 1999 Order Number: 298029-001 R - -Intel 82802 Firmware HUB Random Number Generator Driver - Copyright (c) 2000 Matt Sottek - -Special thanks to Matt Sottek. I did the "guts", he -did the "brains" and all the testing. diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt deleted file mode 100644 index 5d63b18bd6d1..000000000000 --- a/Documentation/iostats.txt +++ /dev/null @@ -1,197 +0,0 @@ -===================== -I/O statistics fields -===================== - -Since 2.4.20 (and some versions before, with patches), and 2.5.45, -more extensive disk statistics have been introduced to help measure disk -activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do -the work for you, but in case you are interested in creating your own -tools, the fields are explained here. - -In 2.4 now, the information is found as additional fields in -``/proc/partitions``. In 2.6 and upper, the same information is found in two -places: one is in the file ``/proc/diskstats``, and the other is within -the sysfs file system, which must be mounted in order to obtain -the information. Throughout this document we'll assume that sysfs -is mounted on ``/sys``, although of course it may be mounted anywhere. -Both ``/proc/diskstats`` and sysfs use the same source for the information -and so should not differ. - -Here are examples of these different formats:: - - 2.4: - 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 - - 2.6+ sysfs: - 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 35486 38030 38030 38030 - - 2.6+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 hda1 35486 38030 38030 38030 - - 4.18+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 - -On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have -a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. - -The advantage of one over the other is that the sysfs choice works well -if you are watching a known, small set of disks. ``/proc/diskstats`` may -be a better choice if you are watching a large number of disks because -you'll avoid the overhead of 50, 100, or 500 or more opens/closes with -each snapshot of your disk statistics. - -In 2.4, the statistics fields are those after the device name. In -the above example, the first field of statistics would be 446216. -By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll -find just the eleven fields, beginning with 446216. If you look at -``/proc/diskstats``, the eleven fields will be preceded by the major and -minor device numbers, and device name. Each of these formats provides -eleven fields of statistics, each meaning exactly the same things. -All fields except field 9 are cumulative since boot. Field 9 should -go to zero as I/Os complete; all others only increase (unless they -overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long -(native word size) numbers, and on a very busy or long-lived system they -may wrap. Applications should be prepared to deal with that; unless -your observations are measured in large numbers of minutes or hours, -they should not wrap twice before you notice them. - -Each set of stats only applies to the indicated device; if you want -system-wide stats you'll have to find all the devices and sum them all up. - -Field 1 -- # of reads completed - This is the total number of reads completed successfully. - -Field 2 -- # of reads merged, field 6 -- # of writes merged - Reads and writes which are adjacent to each other may be merged for - efficiency. Thus two 4K reads may become one 8K read before it is - ultimately handed to the disk, and so it will be counted (and queued) - as only one I/O. This field lets you know how often this was done. - -Field 3 -- # of sectors read - This is the total number of sectors read successfully. - -Field 4 -- # of milliseconds spent reading - This is the total number of milliseconds spent by all reads (as - measured from __make_request() to end_that_request_last()). - -Field 5 -- # of writes completed - This is the total number of writes completed successfully. - -Field 6 -- # of writes merged - See the description of field 2. - -Field 7 -- # of sectors written - This is the total number of sectors written successfully. - -Field 8 -- # of milliseconds spent writing - This is the total number of milliseconds spent by all writes (as - measured from __make_request() to end_that_request_last()). - -Field 9 -- # of I/Os currently in progress - The only field that should go to zero. Incremented as requests are - given to appropriate struct request_queue and decremented as they finish. - -Field 10 -- # of milliseconds spent doing I/Os - This field increases so long as field 9 is nonzero. - - Since 5.0 this field counts jiffies when at least one request was - started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. - -Field 11 -- weighted # of milliseconds spent doing I/Os - This field is incremented at each I/O start, I/O completion, I/O - merge, or read of these stats by the number of I/Os in progress - (field 9) times the number of milliseconds spent doing I/O since the - last update of this field. This can provide an easy measure of both - I/O completion time and the backlog that may be accumulating. - -Field 12 -- # of discards completed - This is the total number of discards completed successfully. - -Field 13 -- # of discards merged - See the description of field 2 - -Field 14 -- # of sectors discarded - This is the total number of sectors discarded successfully. - -Field 15 -- # of milliseconds spent discarding - This is the total number of milliseconds spent by all discards (as - measured from __make_request() to end_that_request_last()). - -To avoid introducing performance bottlenecks, no locks are held while -modifying these counters. This implies that minor inaccuracies may be -introduced when changes collide, so (for instance) adding up all the -read I/Os issued per partition should equal those made to the disks ... -but due to the lack of locking it may only be very close. - -In 2.6+, there are counters for each CPU, which make the lack of locking -almost a non-issue. When the statistics are read, the per-CPU counters -are summed (possibly overflowing the unsigned long variable they are -summed to) and the result given to the user. There is no convenient -user interface for accessing the per-CPU counters themselves. - -Disks vs Partitions -------------------- - -There were significant changes between 2.4 and 2.6+ in the I/O subsystem. -As a result, some statistic information disappeared. The translation from -a disk address relative to a partition to the disk address relative to -the host disk happens much earlier. All merges and timings now happen -at the disk level rather than at both the disk and partition level as -in 2.4. Consequently, you'll see a different statistics output on 2.6+ for -partitions from that for disks. There are only *four* fields available -for partitions on 2.6+ machines. This is reflected in the examples above. - -Field 1 -- # of reads issued - This is the total number of reads issued to this partition. - -Field 2 -- # of sectors read - This is the total number of sectors requested to be read from this - partition. - -Field 3 -- # of writes issued - This is the total number of writes issued to this partition. - -Field 4 -- # of sectors written - This is the total number of sectors requested to be written to - this partition. - -Note that since the address is translated to a disk-relative one, and no -record of the partition-relative address is kept, the subsequent success -or failure of the read cannot be attributed to the partition. In other -words, the number of reads for partitions is counted slightly before time -of queuing for partitions, and at completion for whole disks. This is -a subtle distinction that is probably uninteresting for most cases. - -More significant is the error induced by counting the numbers of -reads/writes before merges for partitions and after for disks. Since a -typical workload usually contains a lot of successive and adjacent requests, -the number of reads/writes issued can be several times higher than the -number of reads/writes completed. - -In 2.6.25, the full statistic set is again available for partitions and -disk and partition statistics are consistent again. Since we still don't -keep record of the partition-relative address, an operation is attributed to -the partition which contains the first sector of the request after the -eventual merges. As requests can be merged across partition, this could lead -to some (probably insignificant) inaccuracy. - -Additional notes ----------------- - -In 2.6+, sysfs is not mounted by default. If your distribution of -Linux hasn't added it already, here's the line you'll want to add to -your ``/etc/fstab``:: - - none /sys sysfs defaults 0 0 - - -In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they -appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in -``/proc/stat`` take a very different format from those in ``/proc/partitions`` -(see proc(5), if your system has it.) - --- ricklind@us.ibm.com diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt deleted file mode 100644 index 4f18456dd3b1..000000000000 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ /dev/null @@ -1,356 +0,0 @@ -========================================== -Reducing OS jitter due to per-cpu kthreads -========================================== - -This document lists per-CPU kthreads in the Linux kernel and presents -options to control their OS jitter. Note that non-per-CPU kthreads are -not listed here. To reduce OS jitter from non-per-CPU kthreads, bind -them to a "housekeeping" CPU dedicated to such work. - -References -========== - -- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. - -- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - -- man taskset: Using the taskset command to bind tasks to sets - of CPUs. - -- man sched_setaffinity: Using the sched_setaffinity() system - call to bind tasks to sets of CPUs. - -- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, - writing "0" to offline and "1" to online. - -- In order to locate kernel-generated OS jitter on CPU N: - - cd /sys/kernel/debug/tracing - echo 1 > max_graph_depth # Increase the "1" for more detail - echo function_graph > current_tracer - # run workload - cat per_cpu/cpuN/trace - -kthreads -======== - -Name: - ehca_comp/%u - -Purpose: - Periodically process Infiniband-related work. - -To reduce its OS jitter, do any of the following: - -1. Don't use eHCA Infiniband hardware, instead choosing hardware - that does not require per-CPU kthreads. This will prevent these - kthreads from being created in the first place. (This will - work for most people, as this hardware, though important, is - relatively old and is produced in relatively low unit volumes.) -2. Do all eHCA-Infiniband-related work on other CPUs, including - interrupts. -3. Rework the eHCA driver so that its per-CPU kthreads are - provisioned only on selected CPUs. - - -Name: - irq/%d-%s - -Purpose: - Handle threaded interrupts. - -To reduce its OS jitter, do the following: - -1. Use irq affinity to force the irq threads to execute on - some other CPU. - -Name: - kcmtpd_ctr_%d - -Purpose: - Handle Bluetooth work. - -To reduce its OS jitter, do one of the following: - -1. Don't use Bluetooth, in which case these kthreads won't be - created in the first place. -2. Use irq affinity to force Bluetooth-related interrupts to - occur on some other CPU and furthermore initiate all - Bluetooth activity on some other CPU. - -Name: - ksoftirqd/%u - -Purpose: - Execute softirq handlers when threaded or when under heavy load. - -To reduce its OS jitter, each softirq vector must be handled -separately as follows: - -TIMER_SOFTIRQ -------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by forcing - both kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force - the CPU offline, then bring it back online. This forces - recurring timers to migrate elsewhere. If you are concerned - with multiple CPUs, force them all offline before bringing the - first one back online. Once you have onlined the CPUs in question, - do not offline any other CPUs, because doing so could force the - timer back onto one of the CPUs in question. - -NET_TX_SOFTIRQ and NET_RX_SOFTIRQ ---------------------------------- - -Do all of the following: - -1. Force networking interrupts onto other CPUs. -2. Initiate any network I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -BLOCK_SOFTIRQ -------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -IRQ_POLL_SOFTIRQ ----------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O and block-I/O polling on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -TASKLET_SOFTIRQ ---------------- - -Do one or more of the following: - -1. Avoid use of drivers that use tasklets. (Such drivers will contain - calls to things like tasklet_schedule().) -2. Convert all drivers that you must use from tasklets to workqueues. -3. Force interrupts for drivers using tasklets onto other CPUs, - and also do I/O involving these drivers on other CPUs. - -SCHED_SOFTIRQ -------------- - -Do all of the following: - -1. Avoid sending scheduler IPIs to the CPU to be de-jittered, - for example, ensure that at most one runnable kthread is present - on that CPU. If a thread that expects to run on the de-jittered - CPU awakens, the scheduler will send an IPI that can result in - a subsequent SCHED_SOFTIRQ. -2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered - is marked as an adaptive-ticks CPU using the "nohz_full=" - boot parameter. This reduces the number of scheduler-clock - interrupts that the de-jittered CPU receives, minimizing its - chances of being selected to do the load balancing work that - runs in SCHED_SOFTIRQ context. -3. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by - forcing both kernel threads and interrupts to execute elsewhere. - This further reduces the number of scheduler-clock interrupts - received by the de-jittered CPU. - -HRTIMER_SOFTIRQ ---------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle. For example, avoid system calls and force both - kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the - CPU offline, then bring it back online. This forces recurring - timers to migrate elsewhere. If you are concerned with multiple - CPUs, force them all offline before bringing the first one - back online. Once you have onlined the CPUs in question, do not - offline any other CPUs, because doing so could force the timer - back onto one of the CPUs in question. - -RCU_SOFTIRQ ------------ - -Do at least one of the following: - -1. Offload callbacks and keep the CPU in either dyntick-idle or - adaptive-ticks state by doing all of the following: - - a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be - de-jittered is marked as an adaptive-ticks CPU using the - "nohz_full=" boot parameter. Bind the rcuo kthreads to - housekeeping CPUs, which can tolerate OS jitter. - b. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -2. Enable RCU to do its processing remotely via dyntick-idle by - doing all of the following: - - a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. - b. Ensure that the CPU goes idle frequently, allowing other - CPUs to detect that it has passed through an RCU quiescent - state. If the kernel is built with CONFIG_NO_HZ_FULL=y, - userspace execution also allows other CPUs to detect that - the CPU in question has passed through a quiescent state. - c. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -Name: - kworker/%u:%d%s (cpu, id, priority) - -Purpose: - Execute workqueue requests - -To reduce its OS jitter, do any of the following: - -1. Run your workload at a real-time priority, which will allow - preempting the kworker daemons. -2. A given workqueue can be made visible in the sysfs filesystem - by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). - Such a workqueue can be confined to a given subset of the - CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs - files. The set of WQ_SYSFS workqueues can be displayed using - "ls sys/devices/virtual/workqueue". That said, the workqueues - maintainer would like to caution people against indiscriminately - sprinkling WQ_SYSFS across all the workqueues. The reason for - caution is that it is easy to add WQ_SYSFS, but because sysfs is - part of the formal user/kernel API, it can be nearly impossible - to remove it, even if its addition was a mistake. -3. Do any of the following needed to avoid jitter that your - application cannot tolerate: - - a. Build your kernel with CONFIG_SLUB=y rather than - CONFIG_SLAB=y, thus avoiding the slab allocator's periodic - use of each CPU's workqueues to run its cache_reap() - function. - b. Avoid using oprofile, thus avoiding OS jitter from - wq_sync_buffer(). - c. Limit your CPU frequency so that a CPU-frequency - governor is not required, possibly enlisting the aid of - special heatsinks or other cooling technologies. If done - correctly, and if you CPU architecture permits, you should - be able to build your kernel with CONFIG_CPU_FREQ=n to - avoid the CPU-frequency governor periodically running - on each CPU, including cs_dbs_timer() and od_dbs_timer(). - - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - d. As of v3.18, Christoph Lameter's on-demand vmstat workers - commit prevents OS jitter due to vmstat_update() on - CONFIG_SMP=y systems. Before v3.18, is not possible - to entirely get rid of the OS jitter, but you can - decrease its frequency by writing a large value to - /proc/sys/vm/stat_interval. The default value is HZ, - for an interval of one second. Of course, larger values - will make your virtual-memory statistics update more - slowly. Of course, you can also run your workload at - a real-time priority, thus preempting vmstat_update(), - but if your workload is CPU-bound, this is a bad idea. - However, there is an RFC patch from Christoph Lameter - (based on an earlier one from Gilad Ben-Yossef) that - reduces or even eliminates vmstat overhead for some - workloads at https://lkml.org/lkml/2013/9/4/379. - e. Boot with "elevator=noop" to avoid workqueue use by - the block layer. - f. If running on high-end powerpc servers, build with - CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS - daemon from running on each CPU every second or so. - (This will require editing Kconfig files and will defeat - this platform's RAS functionality.) This avoids jitter - due to the rtas_event_scan() function. - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - g. If running on Cell Processor, build your kernel with - CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from - spu_gov_work(). - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - h. If running on PowerMAC, build your kernel with - CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, - avoiding OS jitter from rackmeter_do_timer(). - -Name: - rcuc/%u - -Purpose: - Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. - -To reduce its OS jitter, do at least one of the following: - -1. Build the kernel with CONFIG_PREEMPT=n. This prevents these - kthreads from being created in the first place, and also obviates - the need for RCU priority boosting. This approach is feasible - for workloads that do not require high degrees of responsiveness. -2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these - kthreads from being created in the first place. This approach - is feasible only if your workload never requires RCU priority - boosting, for example, if you ensure frequent idle time on all - CPUs that might execute within the kernel. -3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= - boot parameter offloading RCU callbacks from all CPUs susceptible - to OS jitter. This approach prevents the rcuc/%u kthreads from - having any work to do, so that they are never awakened. -4. Ensure that the CPU never enters the kernel, and, in particular, - avoid initiating any CPU hotplug operations on this CPU. This is - another way of preventing any callbacks from being queued on the - CPU, again preventing the rcuc/%u kthreads from having any work - to do. - -Name: - rcuop/%d and rcuos/%d - -Purpose: - Offload RCU callbacks from the corresponding CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Use affinity, cgroups, or other mechanism to force these kthreads - to execute on some other CPU. -2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these - kthreads from being created in the first place. However, please - note that this will not eliminate OS jitter, but will instead - shift it to RCU_SOFTIRQ. - -Name: - watchdog/%u - -Purpose: - Detect software lockups on each CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these - kthreads from being created in the first place. -2. Boot with "nosoftlockup=0", which will also prevent these kthreads - from being created. Other related watchdog and softlockup boot - parameters may be found in Documentation/admin-guide/kernel-parameters.rst - and Documentation/watchdog/watchdog-parameters.rst. -3. Echo a zero to /proc/sys/kernel/watchdog to disable the - watchdog timer. -4. Echo a large number of /proc/sys/kernel/watchdog_thresh in - order to reduce the frequency of OS jitter due to the watchdog - timer down to a level that is acceptable for your workload. diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt deleted file mode 100644 index 12c571368e73..000000000000 --- a/Documentation/ldm.txt +++ /dev/null @@ -1,121 +0,0 @@ -========================================== -LDM - Logical Disk Manager (Dynamic Disks) -========================================== - -:Author: Originally Written by FlatCap - Richard Russon . -:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. - -Overview --------- - -Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete -replacement for the MSDOS style partitions. It stores its information in a -1MiB journalled database at the end of the physical disk. The size of -partitions is limited only by disk space. The maximum number of partitions is -nearly 2000. - -Any partitions created under the LDM are called "Dynamic Disks". There are no -longer any primary or extended partitions. Normal MSDOS style partitions are -now known as Basic Disks. - -If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use -Dynamic Disks. The journalling allows Windows to make changes to these -partitions and filesystems without the need to reboot. - -Once the LDM driver has divided up the disk, you can use the MD driver to -assemble any multi-partition volumes, e.g. Stripes, RAID5. - -To prevent legacy applications from repartitioning the disk, the LDM creates a -dummy MSDOS partition containing one disk-sized partition. This is what is -supported with the Linux LDM driver. - -A newer approach that has been implemented with Vista is to put LDM on top of a -GPT label disk. This is not supported by the Linux LDM driver yet. - - -Example -------- - -Below we have a 50MiB disk, divided into seven partitions. - -.. note:: - - The missing 1MiB at the end of the disk is where the LDM database is - stored. - -+-------++--------------+---------+-----++--------------+---------+----+ -|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| -+=======++==============+=========+=====++==============+=========+====+ -|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| -+-------++--------------+---------+-----++--------------+---------+----+ - -The LDM Database may not store the partitions in the order that they appear on -disk, but the driver will sort them. - -When Linux boots, you will see something like:: - - hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 - hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 - - -Compiling LDM Support ---------------------- - -To enable LDM, choose the following two options: - - - "Advanced partition selection" CONFIG_PARTITION_ADVANCED - - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION - -If you believe the driver isn't working as it should, you can enable the extra -debugging code. This will produce a LOT of output. The option is: - - - "Windows LDM extra logging" CONFIG_LDM_DEBUG - -N.B. The partition code cannot be compiled as a module. - -As with all the partition code, if the driver doesn't see signs of its type of -partition, it will pass control to another driver, so there is no harm in -enabling it. - -If you have Dynamic Disks but don't enable the driver, then all you will see -is a dummy MSDOS partition filling the whole disk. You won't be able to mount -any of the volumes on the disk. - - -Booting -------- - -If you enable LDM support, then lilo is capable of booting from any of the -discovered partitions. However, grub does not understand the LDM partitioning -and cannot boot from a Dynamic Disk. - - -More Documentation ------------------- - -There is an Overview of the LDM together with complete Technical Documentation. -It is available for download. - - http://www.linux-ntfs.org/ - -If you have any LDM questions that aren't answered in the documentation, email -me. - -Cheers, - FlatCap - Richard Russon - ldm@flatcap.org - diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt deleted file mode 100644 index 290840c160af..000000000000 --- a/Documentation/lockup-watchdogs.txt +++ /dev/null @@ -1,83 +0,0 @@ -=============================================================== -Softlockup detector and hardlockup detector (aka nmi_watchdog) -=============================================================== - -The Linux kernel can act as a watchdog to detect both soft and hard -lockups. - -A 'softlockup' is defined as a bug that causes the kernel to loop in -kernel mode for more than 20 seconds (see "Implementation" below for -details), without giving other tasks a chance to run. The current -stack trace is displayed upon detection and, by default, the system -will stay locked up. Alternatively, the kernel can be configured to -panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, -"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for -details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are -provided for this. - -A 'hardlockup' is defined as a bug that causes the CPU to loop in -kernel mode for more than 10 seconds (see "Implementation" below for -details), without letting other interrupts have a chance to run. -Similarly to the softlockup case, the current stack trace is displayed -upon detection and the system will stay locked up unless the default -behavior is changed, which can be done through a sysctl, -'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", -and a kernel parameter, "nmi_watchdog" -(see "Documentation/admin-guide/kernel-parameters.rst" for details). - -The panic option can be used in combination with panic_timeout (this -timeout is set through the confusingly named "kernel.panic" sysctl), -to cause the system to reboot automatically after a specified amount -of time. - -Implementation -============== - -The soft and hard lockup detectors are built on top of the hrtimer and -perf subsystems, respectively. A direct consequence of this is that, -in principle, they should work in any architecture where these -subsystems are present. - -A periodic hrtimer runs to generate interrupts and kick the watchdog -task. An NMI perf event is generated every "watchdog_thresh" -(compile-time initialized to 10 and configurable through sysctl of the -same name) seconds to check for hardlockups. If any CPU in the system -does not receive any hrtimer interrupt during that time the -'hardlockup detector' (the handler for the NMI perf event) will -generate a kernel warning or call panic, depending on the -configuration. - -The watchdog task is a high priority kernel thread that updates a -timestamp every time it is scheduled. If that timestamp is not updated -for 2*watchdog_thresh seconds (the softlockup threshold) the -'softlockup detector' (coded inside the hrtimer callback function) -will dump useful debug information to the system log, after which it -will call panic if it was instructed to do so or resume execution of -other kernel code. - -The period of the hrtimer is 2*watchdog_thresh/5, which means it has -two or three chances to generate an interrupt before the hardlockup -detector kicks in. - -As explained above, a kernel knob is provided that allows -administrators to configure the period of the hrtimer and the perf -event. The right value for a particular environment is a trade-off -between fast response to lockups and detection overhead. - -By default, the watchdog runs on all online cores. However, on a -kernel configured with NO_HZ_FULL, by default the watchdog runs only -on the housekeeping cores, not the cores specified in the "nohz_full" -boot argument. If we allowed the watchdog to run by default on -the "nohz_full" cores, we would have to run timer ticks to activate -the scheduler, which would prevent the "nohz_full" functionality -from protecting the user code on those cores from the kernel. -Of course, disabling it by default on the nohz_full cores means that -when those cores do enter the kernel, by default we will not be -able to detect if they lock up. However, allowing the watchdog -to continue to run on the housekeeping (non-tickless) cores means -that we will continue to detect lockups properly on those cores. - -In either case, the set of cores excluded from running the watchdog -may be adjusted via the kernel.watchdog_cpumask sysctl. For -nohz_full cores, this may be useful for debugging a case where the -kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt deleted file mode 100644 index aaf1667489f8..000000000000 --- a/Documentation/numastat.txt +++ /dev/null @@ -1,30 +0,0 @@ -=============================== -Numa policy hit/miss statistics -=============================== - -/sys/devices/system/node/node*/numastat - -All units are pages. Hugepages have separate counters. - -=============== ============================================================ -numa_hit A process wanted to allocate memory from this node, - and succeeded. - -numa_miss A process wanted to allocate memory from another node, - but ended up with memory from this node. - -numa_foreign A process wanted to allocate on this node, - but ended up with memory from another one. - -local_node A process ran on this node and got memory from it. - -other_node A process ran on this node and got memory from another node. - -interleave_hit Interleaving wanted to allocate from this node - and succeeded. -=============== ============================================================ - -For easier reading you can use the numastat utility from the numactl package -(http://oss.sgi.com/projects/libnuma/). Note that it only works -well right now on machines with a small number of CPUs. - diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt deleted file mode 100644 index bab2d10631f0..000000000000 --- a/Documentation/pnp.txt +++ /dev/null @@ -1,292 +0,0 @@ -================================= -Linux Plug and Play Documentation -================================= - -:Author: Adam Belay -:Last updated: Oct. 16, 2002 - - -Overview --------- - -Plug and Play provides a means of detecting and setting resources for legacy or -otherwise unconfigurable devices. The Linux Plug and Play Layer provides these -services to compatible drivers. - - -The User Interface ------------------- - -The Linux Plug and Play user interface provides a means to activate PnP devices -for legacy and user level drivers that do not support Linux Plug and Play. The -user interface is integrated into sysfs. - -In addition to the standard sysfs file the following are created in each -device's directory: -- id - displays a list of support EISA IDs -- options - displays possible resource configurations -- resources - displays currently allocated resources and allows resource changes - -activating a device -^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "auto" > resources - -this will invoke the automatic resource config system to activate the device - -manually activating a device -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "manual " > resources - - - the configuration number - - static or dynamic - static = for next boot - dynamic = now - -disabling a device -^^^^^^^^^^^^^^^^^^ - -:: - - # echo "disable" > resources - - -EXAMPLE: - -Suppose you need to activate the floppy disk controller. - -1. change to the proper directory, in my case it is - /driver/bus/pnp/devices/00:0f:: - - # cd /driver/bus/pnp/devices/00:0f - # cat name - PC standard floppy disk controller - -2. check if the device is already active:: - - # cat resources - DISABLED - - - Notice the string "DISABLED". This means the device is not active. - -3. check the device's possible configurations (optional):: - - # cat options - Dependent: 01 - Priority acceptable - port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding - port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - Dependent: 02 - Priority acceptable - port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding - port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - -4. now activate the device:: - - # echo "auto" > resources - -5. finally check if the device is active:: - - # cat resources - io 0x3f0-0x3f5 - io 0x3f7-0x3f7 - irq 6 - dma 2 - -also there are a series of kernel parameters:: - - pnp_reserve_irq=irq1[,irq2] .... - pnp_reserve_dma=dma1[,dma2] .... - pnp_reserve_io=io1,size1[,io2,size2] .... - pnp_reserve_mem=mem1,size1[,mem2,size2] .... - - - -The Unified Plug and Play Layer -------------------------------- - -All Plug and Play drivers, protocols, and services meet at a central location -called the Plug and Play Layer. This layer is responsible for the exchange of -information between PnP drivers and PnP protocols. Thus it automatically -forwards commands to the proper protocol. This makes writing PnP drivers -significantly easier. - -The following functions are available from the Plug and Play Layer: - -pnp_get_protocol - increments the number of uses by one - -pnp_put_protocol - deincrements the number of uses by one - -pnp_register_protocol - use this to register a new PnP protocol - -pnp_unregister_protocol - use this function to remove a PnP protocol from the Plug and Play Layer - -pnp_register_driver - adds a PnP driver to the Plug and Play Layer - - this includes driver model integration - returns zero for success or a negative error number for failure; count - calls to the .add() method if you need to know how many devices bind to - the driver - -pnp_unregister_driver - removes a PnP driver from the Plug and Play Layer - - - -Plug and Play Protocols ------------------------ - -This section contains information for PnP protocol developers. - -The following Protocols are currently available in the computing world: - -- PNPBIOS: - used for system devices such as serial and parallel ports. -- ISAPNP: - provides PnP support for the ISA bus -- ACPI: - among its many uses, ACPI provides information about system level - devices. - -It is meant to replace the PNPBIOS. It is not currently supported by Linux -Plug and Play but it is planned to be in the near future. - - -Requirements for a Linux PnP protocol: -1. the protocol must use EISA IDs -2. the protocol must inform the PnP Layer of a device's current configuration - -- the ability to set resources is optional but preferred. - -The following are PnP protocol related functions: - -pnp_add_device - use this function to add a PnP device to the PnP layer - - only call this function when all wanted values are set in the pnp_dev - structure - -pnp_init_device - call this to initialize the PnP structure - -pnp_remove_device - call this to remove a device from the Plug and Play Layer. - it will fail if the device is still in use. - automatically will free mem used by the device and related structures - -pnp_add_id - adds an EISA ID to the list of supported IDs for the specified device - -For more information consult the source of a protocol such as -/drivers/pnp/pnpbios/core.c. - - - -Linux Plug and Play Drivers ---------------------------- - -This section contains information for Linux PnP driver developers. - -The New Way -^^^^^^^^^^^ - -1. first make a list of supported EISA IDS - - ex:: - - static const struct pnp_id pnp_dev_table[] = { - /* Standard LPT Printer Port */ - {.id = "PNP0400", .driver_data = 0}, - /* ECP Printer Port */ - {.id = "PNP0401", .driver_data = 0}, - {.id = ""} - }; - - Please note that the character 'X' can be used as a wild card in the function - portion (last four characters). - - ex:: - - /* Unknown PnP modems */ - { "PNPCXXX", UNKNOWN_DEV }, - - Supported PnP card IDs can optionally be defined. - ex:: - - static const struct pnp_id pnp_card_table[] = { - { "ANYDEVS", 0 }, - { "", 0 } - }; - -2. Optionally define probe and remove functions. It may make sense not to - define these functions if the driver already has a reliable method of detecting - the resources, such as the parport_pc driver. - - ex:: - - static int - serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const - struct pnp_id *dev_id) - { - . . . - - ex:: - - static void serial_pnp_remove(struct pnp_dev * dev) - { - . . . - - consult /drivers/serial/8250_pnp.c for more information. - -3. create a driver structure - - ex:: - - static struct pnp_driver serial_pnp_driver = { - .name = "serial", - .card_id_table = pnp_card_table, - .id_table = pnp_dev_table, - .probe = serial_pnp_probe, - .remove = serial_pnp_remove, - }; - - * name and id_table cannot be NULL. - -4. register the driver - - ex:: - - static int __init serial8250_pnp_init(void) - { - return pnp_register_driver(&serial_pnp_driver); - } - -The Old Way -^^^^^^^^^^^ - -A series of compatibility functions have been created to make it easy to convert -ISAPNP drivers. They should serve as a temporary solution only. - -They are as follows:: - - struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from) - - struct pnp_dev *pnp_find_dev(struct pnp_card *card, - unsigned short vendor, - unsigned short function, - struct pnp_dev *from) - diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt deleted file mode 100644 index 688c95b11919..000000000000 --- a/Documentation/rtc.txt +++ /dev/null @@ -1,140 +0,0 @@ -======================================= -Real Time Clock (RTC) Drivers for Linux -======================================= - -When Linux developers talk about a "Real Time Clock", they usually mean -something that tracks wall clock time and is battery backed so that it -works even with system power off. Such clocks will normally not track -the local time zone or daylight savings time -- unless they dual boot -with MS-Windows -- but will instead be set to Coordinated Universal Time -(UTC, formerly "Greenwich Mean Time"). - -The newest non-PC hardware tends to just count seconds, like the time(2) -system call reports, but RTCs also very commonly represent time using -the Gregorian calendar and 24 hour time, as reported by gmtime(3). - -Linux has two largely-compatible userspace RTC API families you may -need to know about: - - * /dev/rtc ... is the RTC provided by PC compatible systems, - so it's not very portable to non-x86 systems. - - * /dev/rtc0, /dev/rtc1 ... are part of a framework that's - supported by a wide variety of RTC chips on all systems. - -Programmers need to understand that the PC/AT functionality is not -always available, and some systems can do much more. That is, the -RTCs use the same API to make requests in both RTC frameworks (using -different filenames of course), but the hardware may not offer the -same functionality. For example, not every RTC is hooked up to an -IRQ, so they can't all issue alarms; and where standard PC RTCs can -only issue an alarm up to 24 hours in the future, other hardware may -be able to schedule one any time in the upcoming century. - - -Old PC/AT-Compatible driver: /dev/rtc --------------------------------------- - -All PCs (even Alpha machines) have a Real Time Clock built into them. -Usually they are built into the chipset of the computer, but some may -actually have a Motorola MC146818 (or clone) on the board. This is the -clock that keeps the date and time while your computer is turned off. - -ACPI has standardized that MC146818 functionality, and extended it in -a few ways (enabling longer alarm periods, and wake-from-hibernate). -That functionality is NOT exposed in the old driver. - -However it can also be used to generate signals from a slow 2Hz to a -relatively fast 8192Hz, in increments of powers of two. These signals -are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is -for...) It can also function as a 24hr alarm, raising IRQ 8 when the -alarm goes off. The alarm can also be programmed to only check any -subset of the three programmable values, meaning that it could be set to -ring on the 30th second of the 30th minute of every hour, for example. -The clock can also be set to generate an interrupt upon every clock -update, thus generating a 1Hz signal. - -The interrupts are reported via /dev/rtc (major 10, minor 135, read only -character device) in the form of an unsigned long. The low byte contains -the type of interrupt (update-done, alarm-rang, or periodic) that was -raised, and the remaining bytes contain the number of interrupts since -the last read. Status information is reported through the pseudo-file -/proc/driver/rtc if the /proc filesystem was enabled. The driver has -built in locking so that only one process is allowed to have the /dev/rtc -interface open at a time. - -A user process can monitor these interrupts by doing a read(2) or a -select(2) on /dev/rtc -- either will block/stop the user process until -the next interrupt is received. This is useful for things like -reasonably high frequency data acquisition where one doesn't want to -burn up 100% CPU by polling gettimeofday etc. etc. - -At high frequencies, or under high loads, the user process should check -the number of interrupts received since the last read to determine if -there has been any interrupt "pileup" so to speak. Just for reference, a -typical 486-33 running a tight read loop on /dev/rtc will start to suffer -occasional interrupt pileup (i.e. > 1 IRQ event since last read) for -frequencies above 1024Hz. So you really should check the high bytes -of the value you read, especially at frequencies above that of the -normal timer interrupt, which is 100Hz. - -Programming and/or enabling interrupt frequencies greater than 64Hz is -only allowed by root. This is perhaps a bit conservative, but we don't want -an evil user generating lots of IRQs on a slow 386sx-16, where it might have -a negative impact on performance. This 64Hz limit can be changed by writing -a different value to /proc/sys/dev/rtc/max-user-freq. Note that the -interrupt handler is only a few lines of code to minimize any possibility -of this effect. - -Also, if the kernel time is synchronized with an external source, the -kernel will write the time back to the CMOS clock every 11 minutes. In -the process of doing this, the kernel briefly turns off RTC periodic -interrupts, so be aware of this if you are doing serious work. If you -don't synchronize the kernel time with an external source (via ntp or -whatever) then the kernel will keep its hands off the RTC, allowing you -exclusive access to the device for your applications. - -The alarm and/or interrupt frequency are programmed into the RTC via -various ioctl(2) calls as listed in ./include/linux/rtc.h -Rather than write 50 pages describing the ioctl() and so on, it is -perhaps more useful to include a small test program that demonstrates -how to use them, and demonstrates the features of the driver. This is -probably a lot more useful to people interested in writing applications -that will be using this driver. See the code at the end of this document. - -(The original /dev/rtc driver was written by Paul Gortmaker.) - - -New portable "RTC Class" drivers: /dev/rtcN --------------------------------------------- - -Because Linux supports many non-ACPI and non-PC platforms, some of which -have more than one RTC style clock, it needed a more portable solution -than expecting a single battery-backed MC146818 clone on every system. -Accordingly, a new "RTC Class" framework has been defined. It offers -three different userspace interfaces: - - * /dev/rtcN ... much the same as the older /dev/rtc interface - - * /sys/class/rtc/rtcN ... sysfs attributes support readonly - access to some RTC attributes. - - * /proc/driver/rtc ... the system clock RTC may expose itself - using a procfs interface. If there is no RTC for the system clock, - rtc0 is used by default. More information is (currently) shown - here than through sysfs. - -The RTC Class framework supports a wide variety of RTCs, ranging from those -integrated into embeddable system-on-chip (SOC) processors to discrete chips -using I2C, SPI, or some other bus to communicate with the host CPU. There's -even support for PC-style RTCs ... including the features exposed on newer PCs -through ACPI. - -The new framework also removes the "one RTC per system" restriction. For -example, maybe the low-power battery-backed RTC is a discrete I2C chip, but -a high functionality RTC is integrated into the SOC. That system might read -the system clock from the discrete RTC, but use the integrated one for all -other tasks, because of its greater functionality. - -Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the -ioctl interface. diff --git a/Documentation/svga.txt b/Documentation/svga.txt deleted file mode 100644 index b6c2f9acca92..000000000000 --- a/Documentation/svga.txt +++ /dev/null @@ -1,249 +0,0 @@ -.. include:: - -================================= -Video Mode Selection Support 2.13 -================================= - -:Copyright: |copy| 1995--1999 Martin Mares, - -Intro -~~~~~ - -This small document describes the "Video Mode Selection" feature which -allows the use of various special video modes supported by the video BIOS. Due -to usage of the BIOS, the selection is limited to boot time (before the -kernel decompression starts) and works only on 80X86 machines. - -.. note:: - - Short intro for the impatient: Just use vga=ask for the first time, - enter ``scan`` on the video mode prompt, pick the mode you want to use, - remember its mode ID (the four-digit hexadecimal number) and then - set the vga parameter to this number (converted to decimal first). - -The video mode to be used is selected by a kernel parameter which can be -specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." -option of LILO (or some other boot loader you use) or by the "vidmode" utility -(present in standard Linux utility packages). You can use the following values -of this parameter:: - - NORMAL_VGA - Standard 80x25 mode available on all display adapters. - - EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. - - ASK_VGA - Display a video mode menu upon startup (see below). - - 0..35 - Menu item number (when you have used the menu to view the list of - modes available on your adapter, you can specify the menu item you want - to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the - mode list displayed may vary as the kernel version changes, because the - modes are listed in a "first detected -- first displayed" manner. It's - better to use absolute mode numbers instead. - - 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below - for exact meaning of the ID). Warning: rdev and LILO don't support - hexadecimal numbers -- you have to convert it to decimal manually. - -Menu -~~~~ - -The ASK_VGA mode causes the kernel to offer a video mode menu upon -bootup. It displays a "Press to see video modes available, -to continue or wait 30 secs" message. If you press , you enter the -menu, if you press or wait 30 seconds, the kernel will boot up in -the standard 80x25 mode. - -The menu looks like:: - - Video adapter: - Mode: COLSxROWS: - 0 0F00 80x25 - 1 0F01 80x50 - 2 0F02 80x43 - 3 0F03 80x26 - .... - Enter mode number or ``scan``: - - tells what video adapter did Linux detect --- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA -with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection -of chipsets is turned off by default as it's inherently unreliable due to -absolutely insane PC design. - -"0 0F00 80x25" means that the first menu item (the menu items are numbered -from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the -next section for a description of mode IDs). - - encourages you to enter the item number or mode ID -you wish to set and press . If the computer complains something about -"Unknown mode ID", it is trying to tell you that it isn't possible to set such -a mode. It's also possible to press only which leaves the current mode. - -The mode list usually contains a few basic modes and some VESA modes. In -case your chipset has been detected, some chipset-specific modes are shown as -well (some of these might be missing or unusable on your machine as different -BIOSes are often shipped with the same card and the mode numbers depend purely -on the VGA BIOS). - -The modes displayed on the menu are partially sorted: The list starts with -the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and -80x43), local modes (if the local modes feature is enabled), VESA modes and -finally SVGA modes for the auto-detected adapter. - -If you are not happy with the mode list offered (e.g., if you think your card -is able to do more), you can enter "scan" instead of item number / mode ID. The -program will try to ask the BIOS for all possible video mode numbers and test -what happens then. The screen will be probably flashing wildly for some time and -strange noises will be heard from inside the monitor and so on and then, really -all consistent video modes supported by your BIOS will appear (plus maybe some -``ghost modes``). If you are afraid this could damage your monitor, don't use -this function. - -After scanning, the mode ordering is a bit different: the auto-detected SVGA -modes are not listed at all and the modes revealed by ``scan`` are shown before -all VESA modes. - -Mode IDs -~~~~~~~~ - -Because of the complexity of all the video stuff, the video mode IDs -used here are also a bit complex. A video mode ID is a 16-bit number usually -expressed in a hexadecimal notation (starting with "0x"). You can set a mode -by entering its mode directly if you know it even if it isn't shown on the menu. - -The ID numbers can be divided to those regions:: - - 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use - outside the menu as this can change from boot to boot (especially if you - have used the ``scan`` feature). - - 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number - (as presented to INT 10, function 00) increased by 0x0100. - - 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by - 0x0100. All VESA modes should be autodetected and shown on the menu. - - 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. - (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, - 945=132x28 for the standard Video7 BIOS) - - 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually - by modifying one of the standard modes). Currently available: - 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) - 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA - 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) - 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) - 0x0f04 leave current video mode - 0x0f05 VGA 80x30 (480 scans, 16-point font) - 0x0f06 VGA 80x34 (480 scans, 14-point font) - 0x0f07 VGA 80x60 (480 scans, 8-point font) - 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) - - 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" - form where RR is a number of rows and CC is a number of columns. - E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. - This is the only fully portable way to refer to a non-standard mode, - but it relies on the mode being found and displayed on the menu - (remember that mode scanning is not done automatically). - - 0xff00 to 0xffff - aliases for backward compatibility: - 0xffff equivalent to 0x0f00 (standard 80x25) - 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) - -If you add 0x8000 to the mode ID, the program will try to recalculate -vertical display timing according to mode parameters, which can be used to -eliminate some annoying bugs of certain VGA BIOSes (usually those used for -cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the -end of the display. - -Options -~~~~~~~ - -Build options for arch/x86/boot/* are selected by the kernel kconfig -utility and the kernel .config file. - -VIDEO_GFX_HACK - includes special hack for setting of graphics modes -to be used later by special drivers. -Allows to set _any_ BIOS mode including graphic ones and forcing specific -text screen resolution instead of peeking it from BIOS variables. Don't use -unless you think you know what you're doing. To activate this setup, use -mode number 0x0f08 (see the Mode IDs section above). - -Still doesn't work? -~~~~~~~~~~~~~~~~~~~ - -When the mode detection doesn't work (e.g., the mode list is incorrect or -the machine hangs instead of displaying the menu), try to switch off some of -the configuration options listed under "Options". If it fails, you can still use -your kernel with the video mode set directly via the kernel parameter. - -In either case, please send me a bug report containing what _exactly_ -happens and how do the configuration switches affect the behaviour of the bug. - -If you start Linux from M$-DOS, you might also use some DOS tools for -video mode setting. In this case, you must specify the 0x0f04 mode ("leave -current settings") to Linux, because if you don't and you use any non-standard -mode, Linux will switch to 80x25 automatically. - -If you set some extended mode and there's one or more extra lines on the -bottom of the display containing already scrolled-out text, your VGA BIOS -contains the most common video BIOS bug called "incorrect vertical display -end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, -this must be done manually -- no autodetection mechanisms are available. - -History -~~~~~~~ - -=============== ================================================================ -1.0 (??-Nov-95) First version supporting all adapters supported by the old - setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels - and then removed due to instability on some machines. -2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost - everything is configurable, the VESA support should be much more - stable, explicit mode numbering allowed, "scan" implemented etc. -2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution - supported. Few bugs fixed. VESA modes are listed prior to - modes supplied by SVGA autodetection as they are more reliable. - CLGD autodetect works better. Doesn't depend on 80x25 being - active when started. Scanning fixed. 80x43 (any VGA) added. - Code cleaned up. -2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX - VESA modes work now). Display end bug workaround supported. - Special modes renumbered to allow adding of the "recalculate" - flag, 0xffff and 0xfffe became aliases instead of real IDs. - Screen contents retained during mode changes. -2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. -2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem - with some boot loaders. Memory management rewritten to reflect - these changes. Unfortunately, screen contents retaining works - only with some loaders now. - Added a Tseng 132x60 mode. -2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. -2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on - several cards with broken VESA code (e.g., ATI VGA). -2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some - cards use very strange mode numbers. - - Added Realtek VGA modes (thanks to Gonzalo Tornaria). - - Hardware testing order slightly changed, tests based on ROM - contents done as first. - - Added support for special Video7 mode switching functions - (thanks to Tom Vander Aa). - - Added 480-scanline modes (especially useful for notebooks, - original version written by hhanemaa@cs.ruu.nl, patched by - Jeff Chua, rewritten by me). - - Screen store/restore fixed. -2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. - - Better recognition of text modes during mode scan. -2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) -2.10(11-Nov-96) - The whole thing made optional. - - Added the CONFIG_VIDEO_400_HACK switch. - - Added the CONFIG_VIDEO_GFX_HACK switch. - - Code cleanup. -2.11(03-May-97) - Yet another cleanup, now including also the documentation. - - Direct testing of SVGA adapters turned off by default, ``scan`` - offered explicitly on the prompt line. - - Removed the doc section describing adding of new probing - functions as I try to get rid of _all_ hardware probing here. -2.12(25-May-98) Added support for VESA frame buffer graphics. -2.13(14-May-99) Minor documentation fixes. -=============== ================================================================ diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt deleted file mode 100644 index 56d6fa2e2368..000000000000 --- a/Documentation/video-output.txt +++ /dev/null @@ -1,34 +0,0 @@ -Video Output Switcher Control -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -2006 luming.yu@intel.com - -The output sysfs class driver provides an abstract video output layer that -can be used to hook platform specific methods to enable/disable video output -device through common sysfs interface. For example, on my IBM ThinkPad T42 -laptop, The ACPI video driver registered its output devices and read/write -method for 'state' with output sysfs class. The user interface under sysfs is:: - - linux:/sys/class/video_output # tree . - . - |-- CRT0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- DVI0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- LCD0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - `-- TV0 - |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - |-- state - |-- subsystem -> ../../../class/video_output - `-- uevent - diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst index 8e9704f61017..e29739904e37 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/x86/topology.rst @@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the respective code. The architecture-agnostic topology definitions are in -Documentation/cputopology.txt. This file holds x86-specific +Documentation/admin-guide/cputopology.rst. This file holds x86-specific differences/specialities which must not necessarily apply to the generic definitions. Thus, the way to read up on Linux topology on x86 is to start with the generic one and look at this one in parallel for the x86 specifics. -- cgit From baa293e9544bea71361950d071579f0e4d5713ed Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 15:39:22 -0300 Subject: docs: driver-api: add a series of orphaned documents There are lots of documents under Documentation/*.txt and a few other orphan documents elsehwere that belong to the driver-API book. Move them to their right place. Reviewed-by: Cornelia Huck # vfio-related parts Acked-by: Logan Gunthorpe # switchtec Signed-off-by: Mauro Carvalho Chehab --- Documentation/ABI/removed/sysfs-class-rfkill | 2 +- Documentation/ABI/stable/sysfs-class-rfkill | 2 +- Documentation/ABI/testing/sysfs-class-switchtec | 2 +- Documentation/EDID/howto.rst | 58 - Documentation/SM501.txt | 74 - Documentation/admin-guide/kernel-parameters.txt | 2 +- .../admin-guide/laptops/thinkpad-acpi.rst | 6 +- Documentation/bt8xxgpio.txt | 62 - Documentation/connector/connector.rst | 156 -- Documentation/console/console.rst | 152 -- Documentation/dcdbas.txt | 99 -- Documentation/dell_rbu.txt | 128 -- Documentation/driver-api/bt8xxgpio.rst | 62 + Documentation/driver-api/connector.rst | 156 ++ Documentation/driver-api/console.rst | 152 ++ Documentation/driver-api/dcdbas.rst | 99 ++ Documentation/driver-api/dell_rbu.rst | 128 ++ Documentation/driver-api/edid.rst | 58 + Documentation/driver-api/eisa.rst | 230 +++ Documentation/driver-api/index.rst | 26 + Documentation/driver-api/isa.rst | 122 ++ Documentation/driver-api/isapnp.rst | 15 + Documentation/driver-api/lightnvm-pblk.rst | 21 + Documentation/driver-api/men-chameleon-bus.rst | 175 ++ Documentation/driver-api/ntb.rst | 236 +++ Documentation/driver-api/nvmem.rst | 189 ++ Documentation/driver-api/parport-lowlevel.rst | 1832 ++++++++++++++++++++ Documentation/driver-api/pti_intel_mid.rst | 106 ++ Documentation/driver-api/pwm.rst | 165 ++ Documentation/driver-api/rfkill.rst | 132 ++ Documentation/driver-api/sgi-ioc4.rst | 49 + Documentation/driver-api/sm501.rst | 74 + Documentation/driver-api/smsc_ece1099.rst | 60 + Documentation/driver-api/switchtec.rst | 102 ++ Documentation/driver-api/sync_file.rst | 86 + Documentation/driver-api/vfio-mediated-device.rst | 414 +++++ Documentation/driver-api/vfio.rst | 520 ++++++ Documentation/driver-api/xillybus.rst | 379 ++++ Documentation/driver-api/zorro.rst | 104 ++ Documentation/eisa.txt | 230 --- Documentation/fb/fbcon.rst | 4 +- Documentation/isa.txt | 122 -- Documentation/isapnp.txt | 15 - Documentation/lightnvm/pblk.txt | 21 - Documentation/men-chameleon-bus.txt | 175 -- Documentation/ntb.txt | 236 --- Documentation/nvmem/nvmem.rst | 189 -- Documentation/parport-lowlevel.txt | 1832 -------------------- Documentation/pti/pti_intel_mid.rst | 106 -- Documentation/pwm.txt | 165 -- Documentation/rfkill.txt | 132 -- Documentation/s390/vfio-ccw.rst | 6 +- Documentation/sgi-ioc4.txt | 49 - Documentation/smsc_ece1099.txt | 60 - Documentation/switchtec.txt | 102 -- Documentation/sync_file.txt | 86 - Documentation/vfio-mediated-device.txt | 414 ----- Documentation/vfio.txt | 520 ------ Documentation/w1/w1.netlink | 2 +- Documentation/xillybus.txt | 379 ---- Documentation/zorro.txt | 104 -- 61 files changed, 5705 insertions(+), 5679 deletions(-) delete mode 100644 Documentation/EDID/howto.rst delete mode 100644 Documentation/SM501.txt delete mode 100644 Documentation/bt8xxgpio.txt delete mode 100644 Documentation/connector/connector.rst delete mode 100644 Documentation/console/console.rst delete mode 100644 Documentation/dcdbas.txt delete mode 100644 Documentation/dell_rbu.txt create mode 100644 Documentation/driver-api/bt8xxgpio.rst create mode 100644 Documentation/driver-api/connector.rst create mode 100644 Documentation/driver-api/console.rst create mode 100644 Documentation/driver-api/dcdbas.rst create mode 100644 Documentation/driver-api/dell_rbu.rst create mode 100644 Documentation/driver-api/edid.rst create mode 100644 Documentation/driver-api/eisa.rst create mode 100644 Documentation/driver-api/isa.rst create mode 100644 Documentation/driver-api/isapnp.rst create mode 100644 Documentation/driver-api/lightnvm-pblk.rst create mode 100644 Documentation/driver-api/men-chameleon-bus.rst create mode 100644 Documentation/driver-api/ntb.rst create mode 100644 Documentation/driver-api/nvmem.rst create mode 100644 Documentation/driver-api/parport-lowlevel.rst create mode 100644 Documentation/driver-api/pti_intel_mid.rst create mode 100644 Documentation/driver-api/pwm.rst create mode 100644 Documentation/driver-api/rfkill.rst create mode 100644 Documentation/driver-api/sgi-ioc4.rst create mode 100644 Documentation/driver-api/sm501.rst create mode 100644 Documentation/driver-api/smsc_ece1099.rst create mode 100644 Documentation/driver-api/switchtec.rst create mode 100644 Documentation/driver-api/sync_file.rst create mode 100644 Documentation/driver-api/vfio-mediated-device.rst create mode 100644 Documentation/driver-api/vfio.rst create mode 100644 Documentation/driver-api/xillybus.rst create mode 100644 Documentation/driver-api/zorro.rst delete mode 100644 Documentation/eisa.txt delete mode 100644 Documentation/isa.txt delete mode 100644 Documentation/isapnp.txt delete mode 100644 Documentation/lightnvm/pblk.txt delete mode 100644 Documentation/men-chameleon-bus.txt delete mode 100644 Documentation/ntb.txt delete mode 100644 Documentation/nvmem/nvmem.rst delete mode 100644 Documentation/parport-lowlevel.txt delete mode 100644 Documentation/pti/pti_intel_mid.rst delete mode 100644 Documentation/pwm.txt delete mode 100644 Documentation/rfkill.txt delete mode 100644 Documentation/sgi-ioc4.txt delete mode 100644 Documentation/smsc_ece1099.txt delete mode 100644 Documentation/switchtec.txt delete mode 100644 Documentation/sync_file.txt delete mode 100644 Documentation/vfio-mediated-device.txt delete mode 100644 Documentation/vfio.txt delete mode 100644 Documentation/xillybus.txt delete mode 100644 Documentation/zorro.txt (limited to 'Documentation') diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill index 3ce6231f20b2..9c08c7f98ffb 100644 --- a/Documentation/ABI/removed/sysfs-class-rfkill +++ b/Documentation/ABI/removed/sysfs-class-rfkill @@ -1,6 +1,6 @@ rfkill - radio frequency (RF) connector kill switch support -For details to this subsystem look at Documentation/rfkill.txt. +For details to this subsystem look at Documentation/driver-api/rfkill.rst. What: /sys/class/rfkill/rfkill[0-9]+/claim Date: 09-Jul-2007 diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill index 80151a409d67..5b154f922643 100644 --- a/Documentation/ABI/stable/sysfs-class-rfkill +++ b/Documentation/ABI/stable/sysfs-class-rfkill @@ -1,6 +1,6 @@ rfkill - radio frequency (RF) connector kill switch support -For details to this subsystem look at Documentation/rfkill.txt. +For details to this subsystem look at Documentation/driver-api/rfkill.rst. For the deprecated /sys/class/rfkill/*/claim knobs of this interface look in Documentation/ABI/removed/sysfs-class-rfkill. diff --git a/Documentation/ABI/testing/sysfs-class-switchtec b/Documentation/ABI/testing/sysfs-class-switchtec index 48cb4c15e430..76c7a661a595 100644 --- a/Documentation/ABI/testing/sysfs-class-switchtec +++ b/Documentation/ABI/testing/sysfs-class-switchtec @@ -1,6 +1,6 @@ switchtec - Microsemi Switchtec PCI Switch Management Endpoint -For details on this subsystem look at Documentation/switchtec.txt. +For details on this subsystem look at Documentation/driver-api/switchtec.rst. What: /sys/class/switchtec Date: 05-Jan-2017 diff --git a/Documentation/EDID/howto.rst b/Documentation/EDID/howto.rst deleted file mode 100644 index 725fd49a88ca..000000000000 --- a/Documentation/EDID/howto.rst +++ /dev/null @@ -1,58 +0,0 @@ -:orphan: - -==== -EDID -==== - -In the good old days when graphics parameters were configured explicitly -in a file called xorg.conf, even broken hardware could be managed. - -Today, with the advent of Kernel Mode Setting, a graphics board is -either correctly working because all components follow the standards - -or the computer is unusable, because the screen remains dark after -booting or it displays the wrong area. Cases when this happens are: -- The graphics board does not recognize the monitor. -- The graphics board is unable to detect any EDID data. -- The graphics board incorrectly forwards EDID data to the driver. -- The monitor sends no or bogus EDID data. -- A KVM sends its own EDID data instead of querying the connected monitor. -Adding the kernel parameter "nomodeset" helps in most cases, but causes -restrictions later on. - -As a remedy for such situations, the kernel configuration item -CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an -individually prepared or corrected EDID data set in the /lib/firmware -directory from where it is loaded via the firmware interface. The code -(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for -commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200, -1680x1050, 1920x1080) as binary blobs, but the kernel source tree does -not contain code to create these data. In order to elucidate the origin -of the built-in binary EDID blobs and to facilitate the creation of -individual data for a specific misbehaving monitor, commented sources -and a Makefile environment are given here. - -To create binary EDID and C source code files from the existing data -material, simply type "make". - -If you want to create your own EDID file, copy the file 1024x768.S, -replace the settings with your own data and add a new target to the -Makefile. Please note that the EDID data structure expects the timing -values in a different way as compared to the standard X11 format. - -X11: - HTimings: - hdisp hsyncstart hsyncend htotal - VTimings: - vdisp vsyncstart vsyncend vtotal - -EDID:: - - #define XPIX hdisp - #define XBLANK htotal-hdisp - #define XOFFSET hsyncstart-hdisp - #define XPULSE hsyncend-hsyncstart - - #define YPIX vdisp - #define YBLANK vtotal-vdisp - #define YOFFSET vsyncstart-vdisp - #define YPULSE vsyncend-vsyncstart diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt deleted file mode 100644 index 882507453ba4..000000000000 --- a/Documentation/SM501.txt +++ /dev/null @@ -1,74 +0,0 @@ -.. include:: - -============ -SM501 Driver -============ - -:Copyright: |copy| 2006, 2007 Simtec Electronics - -The Silicon Motion SM501 multimedia companion chip is a multifunction device -which may provide numerous interfaces including USB host controller USB gadget, -asynchronous serial ports, audio functions, and a dual display video interface. -The device may be connected by PCI or local bus with varying functions enabled. - -Core ----- - -The core driver in drivers/mfd provides common services for the -drivers which manage the specific hardware blocks. These services -include locking for common registers, clock control and resource -management. - -The core registers drivers for both PCI and generic bus based -chips via the platform device and driver system. - -On detection of a device, the core initialises the chip (which may -be specified by the platform data) and then exports the selected -peripheral set as platform devices for the specific drivers. - -The core re-uses the platform device system as the platform device -system provides enough features to support the drivers without the -need to create a new bus-type and the associated code to go with it. - - -Resources ---------- - -Each peripheral has a view of the device which is implicitly narrowed to -the specific set of resources that peripheral requires in order to -function correctly. - -The centralised memory allocation allows the driver to ensure that the -maximum possible resource allocation can be made to the video subsystem -as this is by-far the most resource-sensitive of the on-chip functions. - -The primary issue with memory allocation is that of moving the video -buffers once a display mode is chosen. Indeed when a video mode change -occurs the memory footprint of the video subsystem changes. - -Since video memory is difficult to move without changing the display -(unless sufficient contiguous memory can be provided for the old and new -modes simultaneously) the video driver fully utilises the memory area -given to it by aligning fb0 to the start of the area and fb1 to the end -of it. Any memory left over in the middle is used for the acceleration -functions, which are transient and thus their location is less critical -as it can be moved. - - -Configuration -------------- - -The platform device driver uses a set of platform data to pass -configurations through to the core and the subsidiary drivers -so that there can be support for more than one system carrying -an SM501 built into a single kernel image. - -The PCI driver assumes that the PCI card behaves as per the Silicon -Motion reference design. - -There is an errata (AB-5) affecting the selection of the -of the M1XCLK and M1CLK frequencies. These two clocks -must be sourced from the same PLL, although they can then -be divided down individually. If this is not set, then SM501 may -lock and hang the whole system. The driver will refuse to -attach if the PLL selection is different. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 19b1e3bef56c..04f7b537ee51 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -930,7 +930,7 @@ edid/1680x1050.bin, or edid/1920x1080.bin is given and no file with the same name exists. Details and instructions how to build your own EDID data are - available in Documentation/EDID/howto.rst. An EDID + available in Documentation/driver-api/edid.rst. An EDID data set will only be used for a particular connector, if its name and a colon are prepended to the EDID name. Each connector may use a unique EDID data diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst index 19d52fc3c5e9..adea0bf2acc5 100644 --- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -643,7 +643,7 @@ Sysfs notes 2010. rfkill controller switch "tpacpi_bluetooth_sw": refer to - Documentation/rfkill.txt for details. + Documentation/driver-api/rfkill.rst for details. Video output control -- /proc/acpi/ibm/video @@ -1406,7 +1406,7 @@ Sysfs notes 2010. rfkill controller switch "tpacpi_wwan_sw": refer to - Documentation/rfkill.txt for details. + Documentation/driver-api/rfkill.rst for details. EXPERIMENTAL: UWB @@ -1426,7 +1426,7 @@ Sysfs notes ^^^^^^^^^^^ rfkill controller switch "tpacpi_uwb_sw": refer to - Documentation/rfkill.txt for details. + Documentation/driver-api/rfkill.rst for details. Adaptive keyboard ----------------- diff --git a/Documentation/bt8xxgpio.txt b/Documentation/bt8xxgpio.txt deleted file mode 100644 index a845feb074de..000000000000 --- a/Documentation/bt8xxgpio.txt +++ /dev/null @@ -1,62 +0,0 @@ -=================================================================== -A driver for a selfmade cheap BT8xx based PCI GPIO-card (bt8xxgpio) -=================================================================== - -For advanced documentation, see http://www.bu3sch.de/btgpio.php - -A generic digital 24-port PCI GPIO card can be built out of an ordinary -Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The -Brooktree chip is used in old analog Hauppauge WinTV PCI cards. You can easily -find them used for low prices on the net. - -The bt8xx chip does have 24 digital GPIO ports. -These ports are accessible via 24 pins on the SMD chip package. - - -How to physically access the GPIO pins -====================================== - -The are several ways to access these pins. One might unsolder the whole chip -and put it on a custom PCI board, or one might only unsolder each individual -GPIO pin and solder that to some tiny wire. As the chip package really is tiny -there are some advanced soldering skills needed in any case. - -The physical pinouts are drawn in the following ASCII art. -The GPIO pins are marked with G00-G23:: - - G G G G G G G G G G G G G G G G G G - 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - --------------------------------------------------------------------------- - --| ^ ^ |-- - --| pin 86 pin 67 |-- - --| |-- - --| pin 61 > |-- G18 - --| |-- G19 - --| |-- G20 - --| |-- G21 - --| |-- G22 - --| pin 56 > |-- G23 - --| |-- - --| Brooktree 878/879 |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| |-- - --| O |-- - --| |-- - --------------------------------------------------------------------------- - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - ^ - This is pin 1 - diff --git a/Documentation/connector/connector.rst b/Documentation/connector/connector.rst deleted file mode 100644 index 24e26dc22dbf..000000000000 --- a/Documentation/connector/connector.rst +++ /dev/null @@ -1,156 +0,0 @@ -:orphan: - -================ -Kernel Connector -================ - -Kernel connector - new netlink based userspace <-> kernel space easy -to use communication module. - -The Connector driver makes it easy to connect various agents using a -netlink based network. One must register a callback and an identifier. -When the driver receives a special netlink message with the appropriate -identifier, the appropriate callback will be called. - -From the userspace point of view it's quite straightforward: - - - socket(); - - bind(); - - send(); - - recv(); - -But if kernelspace wants to use the full power of such connections, the -driver writer must create special sockets, must know about struct sk_buff -handling, etc... The Connector driver allows any kernelspace agents to use -netlink based networking for inter-process communication in a significantly -easier way:: - - int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); - void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask); - void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask); - - struct cb_id - { - __u32 idx; - __u32 val; - }; - -idx and val are unique identifiers which must be registered in the -connector.h header for in-kernel usage. `void (*callback) (void *)` is a -callback function which will be called when a message with above idx.val -is received by the connector core. The argument for that function must -be dereferenced to `struct cn_msg *`:: - - struct cn_msg - { - struct cb_id id; - - __u32 seq; - __u32 ack; - - __u32 len; /* Length of the following data */ - __u8 data[0]; - }; - -Connector interfaces -==================== - - .. kernel-doc:: include/linux/connector.h - - Note: - When registering new callback user, connector core assigns - netlink group to the user which is equal to its id.idx. - -Protocol description -==================== - -The current framework offers a transport layer with fixed headers. The -recommended protocol which uses such a header is as following: - -msg->seq and msg->ack are used to determine message genealogy. When -someone sends a message, they use a locally unique sequence and random -acknowledge number. The sequence number may be copied into -nlmsghdr->nlmsg_seq too. - -The sequence number is incremented with each message sent. - -If you expect a reply to the message, then the sequence number in the -received message MUST be the same as in the original message, and the -acknowledge number MUST be the same + 1. - -If we receive a message and its sequence number is not equal to one we -are expecting, then it is a new message. If we receive a message and -its sequence number is the same as one we are expecting, but its -acknowledge is not equal to the sequence number in the original -message + 1, then it is a new message. - -Obviously, the protocol header contains the above id. - -The connector allows event notification in the following form: kernel -driver or userspace process can ask connector to notify it when -selected ids will be turned on or off (registered or unregistered its -callback). It is done by sending a special command to the connector -driver (it also registers itself with id={-1, -1}). - -As example of this usage can be found in the cn_test.c module which -uses the connector to request notification and to send messages. - -Reliability -=========== - -Netlink itself is not a reliable protocol. That means that messages can -be lost due to memory pressure or process' receiving queue overflowed, -so caller is warned that it must be prepared. That is why the struct -cn_msg [main connector's message header] contains u32 seq and u32 ack -fields. - -Userspace usage -=============== - -2.6.14 has a new netlink socket implementation, which by default does not -allow people to send data to netlink groups other than 1. -So, if you wish to use a netlink socket (for example using connector) -with a different group number, the userspace application must subscribe to -that group first. It can be achieved by the following pseudocode:: - - s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); - - l_local.nl_family = AF_NETLINK; - l_local.nl_groups = 12345; - l_local.nl_pid = 0; - - if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { - perror("bind"); - close(s); - return -1; - } - - { - int on = l_local.nl_groups; - setsockopt(s, 270, 1, &on, sizeof(on)); - } - -Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket -option. To drop a multicast subscription, one should call the above socket -option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. - -2.6.14 netlink code only allows to select a group which is less or equal to -the maximum group number, which is used at netlink_kernel_create() time. -In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use -group number 12345, you must increment CN_NETLINK_USERS to that number. -Additional 0xf numbers are allocated to be used by non-in-kernel users. - -Due to this limitation, group 0xffffffff does not work now, so one can -not use add/remove connector's group notifications, but as far as I know, -only cn_test.c test module used it. - -Some work in netlink area is still being done, so things can be changed in -2.6.15 timeframe, if it will happen, documentation will be updated for that -kernel. - -Code samples -============ - -Sample code for a connector test module and user space can be found -in samples/connector/. To build this code, enable CONFIG_CONNECTOR -and CONFIG_SAMPLES. diff --git a/Documentation/console/console.rst b/Documentation/console/console.rst deleted file mode 100644 index b374141b027e..000000000000 --- a/Documentation/console/console.rst +++ /dev/null @@ -1,152 +0,0 @@ -:orphan: - -=============== -Console Drivers -=============== - -The Linux kernel has 2 general types of console drivers. The first type is -assigned by the kernel to all the virtual consoles during the boot process. -This type will be called 'system driver', and only one system driver is allowed -to exist. The system driver is persistent and it can never be unloaded, though -it may become inactive. - -The second type has to be explicitly loaded and unloaded. This will be called -'modular driver' by this document. Multiple modular drivers can coexist at -any time with each driver sharing the console with other drivers including -the system driver. However, modular drivers cannot take over the console -that is currently occupied by another modular driver. (Exception: Drivers that -call do_take_over_console() will succeed in the takeover regardless of the type -of driver occupying the consoles.) They can only take over the console that is -occupied by the system driver. In the same token, if the modular driver is -released by the console, the system driver will take over. - -Modular drivers, from the programmer's point of view, have to call:: - - do_take_over_console() - load and bind driver to console layer - give_up_console() - unload driver; it will only work if driver - is fully unbound - -In newer kernels, the following are also available:: - - do_register_con_driver() - do_unregister_con_driver() - -If sysfs is enabled, the contents of /sys/class/vtconsole can be -examined. This shows the console backends currently registered by the -system which are named vtcon where is an integer from 0 to 15. -Thus:: - - ls /sys/class/vtconsole - . .. vtcon0 vtcon1 - -Each directory in /sys/class/vtconsole has 3 files:: - - ls /sys/class/vtconsole/vtcon0 - . .. bind name uevent - -What do these files signify? - - 1. bind - this is a read/write file. It shows the status of the driver if - read, or acts to bind or unbind the driver to the virtual consoles - when written to. The possible values are: - - 0 - - means the driver is not bound and if echo'ed, commands the driver - to unbind - - 1 - - means the driver is bound and if echo'ed, commands the driver to - bind - - 2. name - read-only file. Shows the name of the driver in this format:: - - cat /sys/class/vtconsole/vtcon0/name - (S) VGA+ - - '(S)' stands for a (S)ystem driver, i.e., it cannot be directly - commanded to bind or unbind - - 'VGA+' is the name of the driver - - cat /sys/class/vtconsole/vtcon1/name - (M) frame buffer device - - In this case, '(M)' stands for a (M)odular driver, one that can be - directly commanded to bind or unbind. - - 3. uevent - ignore this file - -When unbinding, the modular driver is detached first, and then the system -driver takes over the consoles vacated by the driver. Binding, on the other -hand, will bind the driver to the consoles that are currently occupied by a -system driver. - -NOTE1: - Binding and unbinding must be selected in Kconfig. It's under:: - - Device Drivers -> - Character devices -> - Support for binding and unbinding console drivers - -NOTE2: - If any of the virtual consoles are in KD_GRAPHICS mode, then binding or - unbinding will not succeed. An example of an application that sets the - console to KD_GRAPHICS is X. - -How useful is this feature? This is very useful for console driver -developers. By unbinding the driver from the console layer, one can unload the -driver, make changes, recompile, reload and rebind the driver without any need -for rebooting the kernel. For regular users who may want to switch from -framebuffer console to VGA console and vice versa, this feature also makes -this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb -for more details.) - -Notes for developers -==================== - -do_take_over_console() is now broken up into:: - - do_register_con_driver() - do_bind_con_driver() - private function - -give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must -be fully unbound for this call to succeed. con_is_bound() will check if the -driver is bound or not. - -Guidelines for console driver writers -===================================== - -In order for binding to and unbinding from the console to properly work, -console drivers must follow these guidelines: - -1. All drivers, except system drivers, must call either do_register_con_driver() - or do_take_over_console(). do_register_con_driver() will just add the driver - to the console's internal list. It won't take over the - console. do_take_over_console(), as it name implies, will also take over (or - bind to) the console. - -2. All resources allocated during con->con_init() must be released in - con->con_deinit(). - -3. All resources allocated in con->con_startup() must be released when the - driver, which was previously bound, becomes unbound. The console layer - does not have a complementary call to con->con_startup() so it's up to the - driver to check when it's legal to release these resources. Calling - con_is_bound() in con->con_deinit() will help. If the call returned - false(), then it's safe to release the resources. This balance has to be - ensured because con->con_startup() can be called again when a request to - rebind the driver to the console arrives. - -4. Upon exit of the driver, ensure that the driver is totally unbound. If the - condition is satisfied, then the driver must call do_unregister_con_driver() - or give_up_console(). - -5. do_unregister_con_driver() can also be called on conditions which make it - impossible for the driver to service console requests. This can happen - with the framebuffer console that suddenly lost all of its drivers. - -The current crop of console drivers should still work correctly, but binding -and unbinding them may cause problems. With minimal fixes, these drivers can -be made to work correctly. - -Antonino Daplas diff --git a/Documentation/dcdbas.txt b/Documentation/dcdbas.txt deleted file mode 100644 index 309cc57a7c1c..000000000000 --- a/Documentation/dcdbas.txt +++ /dev/null @@ -1,99 +0,0 @@ -=================================== -Dell Systems Management Base Driver -=================================== - -Overview -======== - -The Dell Systems Management Base Driver provides a sysfs interface for -systems management software such as Dell OpenManage to perform system -management interrupts and host control actions (system power cycle or -power off after OS shutdown) on certain Dell systems. - -Dell OpenManage requires this driver on the following Dell PowerEdge systems: -300, 1300, 1400, 400SC, 500SC, 1500SC, 1550, 600SC, 1600SC, 650, 1655MC, -700, and 750. Other Dell software such as the open source libsmbios project -is expected to make use of this driver, and it may include the use of this -driver on other Dell systems. - -The Dell libsmbios project aims towards providing access to as much BIOS -information as possible. See http://linux.dell.com/libsmbios/main/ for -more information about the libsmbios project. - - -System Management Interrupt -=========================== - -On some Dell systems, systems management software must access certain -management information via a system management interrupt (SMI). The SMI data -buffer must reside in 32-bit address space, and the physical address of the -buffer is required for the SMI. The driver maintains the memory required for -the SMI and provides a way for the application to generate the SMI. -The driver creates the following sysfs entries for systems management -software to perform these system management interrupts:: - - /sys/devices/platform/dcdbas/smi_data - /sys/devices/platform/dcdbas/smi_data_buf_phys_addr - /sys/devices/platform/dcdbas/smi_data_buf_size - /sys/devices/platform/dcdbas/smi_request - -Systems management software must perform the following steps to execute -a SMI using this driver: - -1) Lock smi_data. -2) Write system management command to smi_data. -3) Write "1" to smi_request to generate a calling interface SMI or - "2" to generate a raw SMI. -4) Read system management command response from smi_data. -5) Unlock smi_data. - - -Host Control Action -=================== - -Dell OpenManage supports a host control feature that allows the administrator -to perform a power cycle or power off of the system after the OS has finished -shutting down. On some Dell systems, this host control feature requires that -a driver perform a SMI after the OS has finished shutting down. - -The driver creates the following sysfs entries for systems management software -to schedule the driver to perform a power cycle or power off host control -action after the system has finished shutting down: - -/sys/devices/platform/dcdbas/host_control_action -/sys/devices/platform/dcdbas/host_control_smi_type -/sys/devices/platform/dcdbas/host_control_on_shutdown - -Dell OpenManage performs the following steps to execute a power cycle or -power off host control action using this driver: - -1) Write host control action to be performed to host_control_action. -2) Write type of SMI that driver needs to perform to host_control_smi_type. -3) Write "1" to host_control_on_shutdown to enable host control action. -4) Initiate OS shutdown. - (Driver will perform host control SMI when it is notified that the OS - has finished shutting down.) - - -Host Control SMI Type -===================== - -The following table shows the value to write to host_control_smi_type to -perform a power cycle or power off host control action: - -=================== ===================== -PowerEdge System Host Control SMI Type -=================== ===================== - 300 HC_SMITYPE_TYPE1 - 1300 HC_SMITYPE_TYPE1 - 1400 HC_SMITYPE_TYPE2 - 500SC HC_SMITYPE_TYPE2 - 1500SC HC_SMITYPE_TYPE2 - 1550 HC_SMITYPE_TYPE2 - 600SC HC_SMITYPE_TYPE2 - 1600SC HC_SMITYPE_TYPE2 - 650 HC_SMITYPE_TYPE2 - 1655MC HC_SMITYPE_TYPE2 - 700 HC_SMITYPE_TYPE3 - 750 HC_SMITYPE_TYPE3 -=================== ===================== diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt deleted file mode 100644 index 5d1ce7bcd04d..000000000000 --- a/Documentation/dell_rbu.txt +++ /dev/null @@ -1,128 +0,0 @@ -============================================================= -Usage of the new open sourced rbu (Remote BIOS Update) driver -============================================================= - -Purpose -======= - -Document demonstrating the use of the Dell Remote BIOS Update driver. -for updating BIOS images on Dell servers and desktops. - -Scope -===== - -This document discusses the functionality of the rbu driver only. -It does not cover the support needed from applications to enable the BIOS to -update itself with the image downloaded in to the memory. - -Overview -======== - -This driver works with Dell OpenManage or Dell Update Packages for updating -the BIOS on Dell servers (starting from servers sold since 1999), desktops -and notebooks (starting from those sold in 2005). - -Please go to http://support.dell.com register and you can find info on -OpenManage and Dell Update packages (DUP). - -Libsmbios can also be used to update BIOS on Dell systems go to -http://linux.dell.com/libsmbios/ for details. - -Dell_RBU driver supports BIOS update using the monolithic image and packetized -image methods. In case of monolithic the driver allocates a contiguous chunk -of physical pages having the BIOS image. In case of packetized the app -using the driver breaks the image in to packets of fixed sizes and the driver -would place each packet in contiguous physical memory. The driver also -maintains a link list of packets for reading them back. - -If the dell_rbu driver is unloaded all the allocated memory is freed. - -The rbu driver needs to have an application (as mentioned above)which will -inform the BIOS to enable the update in the next system reboot. - -The user should not unload the rbu driver after downloading the BIOS image -or updating. - -The driver load creates the following directories under the /sys file system:: - - /sys/class/firmware/dell_rbu/loading - /sys/class/firmware/dell_rbu/data - /sys/devices/platform/dell_rbu/image_type - /sys/devices/platform/dell_rbu/data - /sys/devices/platform/dell_rbu/packet_size - -The driver supports two types of update mechanism; monolithic and packetized. -These update mechanism depends upon the BIOS currently running on the system. -Most of the Dell systems support a monolithic update where the BIOS image is -copied to a single contiguous block of physical memory. - -In case of packet mechanism the single memory can be broken in smaller chunks -of contiguous memory and the BIOS image is scattered in these packets. - -By default the driver uses monolithic memory for the update type. This can be -changed to packets during the driver load time by specifying the load -parameter image_type=packet. This can also be changed later as below:: - - echo packet > /sys/devices/platform/dell_rbu/image_type - -In packet update mode the packet size has to be given before any packets can -be downloaded. It is done as below:: - - echo XXXX > /sys/devices/platform/dell_rbu/packet_size - -In the packet update mechanism, the user needs to create a new file having -packets of data arranged back to back. It can be done as follows -The user creates packets header, gets the chunk of the BIOS image and -places it next to the packetheader; now, the packetheader + BIOS image chunk -added together should match the specified packet_size. This makes one -packet, the user needs to create more such packets out of the entire BIOS -image file and then arrange all these packets back to back in to one single -file. - -This file is then copied to /sys/class/firmware/dell_rbu/data. -Once this file gets to the driver, the driver extracts packet_size data from -the file and spreads it across the physical memory in contiguous packet_sized -space. - -This method makes sure that all the packets get to the driver in a single operation. - -In monolithic update the user simply get the BIOS image (.hdr file) and copies -to the data file as is without any change to the BIOS image itself. - -Do the steps below to download the BIOS image. - -1) echo 1 > /sys/class/firmware/dell_rbu/loading -2) cp bios_image.hdr /sys/class/firmware/dell_rbu/data -3) echo 0 > /sys/class/firmware/dell_rbu/loading - -The /sys/class/firmware/dell_rbu/ entries will remain till the following is -done. - -:: - - echo -1 > /sys/class/firmware/dell_rbu/loading - -Until this step is completed the driver cannot be unloaded. - -Also echoing either mono, packet or init in to image_type will free up the -memory allocated by the driver. - -If a user by accident executes steps 1 and 3 above without executing step 2; -it will make the /sys/class/firmware/dell_rbu/ entries disappear. - -The entries can be recreated by doing the following:: - - echo init > /sys/devices/platform/dell_rbu/image_type - -.. note:: echoing init in image_type does not change it original value. - -Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to -read back the image downloaded. - -.. note:: - - After updating the BIOS image a user mode application needs to execute - code which sends the BIOS update request to the BIOS. So on the next reboot - the BIOS knows about the new image downloaded and it updates itself. - Also don't unload the rbu driver if the image has to be updated. - diff --git a/Documentation/driver-api/bt8xxgpio.rst b/Documentation/driver-api/bt8xxgpio.rst new file mode 100644 index 000000000000..a845feb074de --- /dev/null +++ b/Documentation/driver-api/bt8xxgpio.rst @@ -0,0 +1,62 @@ +=================================================================== +A driver for a selfmade cheap BT8xx based PCI GPIO-card (bt8xxgpio) +=================================================================== + +For advanced documentation, see http://www.bu3sch.de/btgpio.php + +A generic digital 24-port PCI GPIO card can be built out of an ordinary +Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The +Brooktree chip is used in old analog Hauppauge WinTV PCI cards. You can easily +find them used for low prices on the net. + +The bt8xx chip does have 24 digital GPIO ports. +These ports are accessible via 24 pins on the SMD chip package. + + +How to physically access the GPIO pins +====================================== + +The are several ways to access these pins. One might unsolder the whole chip +and put it on a custom PCI board, or one might only unsolder each individual +GPIO pin and solder that to some tiny wire. As the chip package really is tiny +there are some advanced soldering skills needed in any case. + +The physical pinouts are drawn in the following ASCII art. +The GPIO pins are marked with G00-G23:: + + G G G G G G G G G G G G G G G G G G + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + --------------------------------------------------------------------------- + --| ^ ^ |-- + --| pin 86 pin 67 |-- + --| |-- + --| pin 61 > |-- G18 + --| |-- G19 + --| |-- G20 + --| |-- G21 + --| |-- G22 + --| pin 56 > |-- G23 + --| |-- + --| Brooktree 878/879 |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| |-- + --| O |-- + --| |-- + --------------------------------------------------------------------------- + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + ^ + This is pin 1 + diff --git a/Documentation/driver-api/connector.rst b/Documentation/driver-api/connector.rst new file mode 100644 index 000000000000..c100c7482289 --- /dev/null +++ b/Documentation/driver-api/connector.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================ +Kernel Connector +================ + +Kernel connector - new netlink based userspace <-> kernel space easy +to use communication module. + +The Connector driver makes it easy to connect various agents using a +netlink based network. One must register a callback and an identifier. +When the driver receives a special netlink message with the appropriate +identifier, the appropriate callback will be called. + +From the userspace point of view it's quite straightforward: + + - socket(); + - bind(); + - send(); + - recv(); + +But if kernelspace wants to use the full power of such connections, the +driver writer must create special sockets, must know about struct sk_buff +handling, etc... The Connector driver allows any kernelspace agents to use +netlink based networking for inter-process communication in a significantly +easier way:: + + int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); + void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask); + void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask); + + struct cb_id + { + __u32 idx; + __u32 val; + }; + +idx and val are unique identifiers which must be registered in the +connector.h header for in-kernel usage. `void (*callback) (void *)` is a +callback function which will be called when a message with above idx.val +is received by the connector core. The argument for that function must +be dereferenced to `struct cn_msg *`:: + + struct cn_msg + { + struct cb_id id; + + __u32 seq; + __u32 ack; + + __u32 len; /* Length of the following data */ + __u8 data[0]; + }; + +Connector interfaces +==================== + + .. kernel-doc:: include/linux/connector.h + + Note: + When registering new callback user, connector core assigns + netlink group to the user which is equal to its id.idx. + +Protocol description +==================== + +The current framework offers a transport layer with fixed headers. The +recommended protocol which uses such a header is as following: + +msg->seq and msg->ack are used to determine message genealogy. When +someone sends a message, they use a locally unique sequence and random +acknowledge number. The sequence number may be copied into +nlmsghdr->nlmsg_seq too. + +The sequence number is incremented with each message sent. + +If you expect a reply to the message, then the sequence number in the +received message MUST be the same as in the original message, and the +acknowledge number MUST be the same + 1. + +If we receive a message and its sequence number is not equal to one we +are expecting, then it is a new message. If we receive a message and +its sequence number is the same as one we are expecting, but its +acknowledge is not equal to the sequence number in the original +message + 1, then it is a new message. + +Obviously, the protocol header contains the above id. + +The connector allows event notification in the following form: kernel +driver or userspace process can ask connector to notify it when +selected ids will be turned on or off (registered or unregistered its +callback). It is done by sending a special command to the connector +driver (it also registers itself with id={-1, -1}). + +As example of this usage can be found in the cn_test.c module which +uses the connector to request notification and to send messages. + +Reliability +=========== + +Netlink itself is not a reliable protocol. That means that messages can +be lost due to memory pressure or process' receiving queue overflowed, +so caller is warned that it must be prepared. That is why the struct +cn_msg [main connector's message header] contains u32 seq and u32 ack +fields. + +Userspace usage +=============== + +2.6.14 has a new netlink socket implementation, which by default does not +allow people to send data to netlink groups other than 1. +So, if you wish to use a netlink socket (for example using connector) +with a different group number, the userspace application must subscribe to +that group first. It can be achieved by the following pseudocode:: + + s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + + l_local.nl_family = AF_NETLINK; + l_local.nl_groups = 12345; + l_local.nl_pid = 0; + + if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { + perror("bind"); + close(s); + return -1; + } + + { + int on = l_local.nl_groups; + setsockopt(s, 270, 1, &on, sizeof(on)); + } + +Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket +option. To drop a multicast subscription, one should call the above socket +option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. + +2.6.14 netlink code only allows to select a group which is less or equal to +the maximum group number, which is used at netlink_kernel_create() time. +In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use +group number 12345, you must increment CN_NETLINK_USERS to that number. +Additional 0xf numbers are allocated to be used by non-in-kernel users. + +Due to this limitation, group 0xffffffff does not work now, so one can +not use add/remove connector's group notifications, but as far as I know, +only cn_test.c test module used it. + +Some work in netlink area is still being done, so things can be changed in +2.6.15 timeframe, if it will happen, documentation will be updated for that +kernel. + +Code samples +============ + +Sample code for a connector test module and user space can be found +in samples/connector/. To build this code, enable CONFIG_CONNECTOR +and CONFIG_SAMPLES. diff --git a/Documentation/driver-api/console.rst b/Documentation/driver-api/console.rst new file mode 100644 index 000000000000..8394ad7747ac --- /dev/null +++ b/Documentation/driver-api/console.rst @@ -0,0 +1,152 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Console Drivers +=============== + +The Linux kernel has 2 general types of console drivers. The first type is +assigned by the kernel to all the virtual consoles during the boot process. +This type will be called 'system driver', and only one system driver is allowed +to exist. The system driver is persistent and it can never be unloaded, though +it may become inactive. + +The second type has to be explicitly loaded and unloaded. This will be called +'modular driver' by this document. Multiple modular drivers can coexist at +any time with each driver sharing the console with other drivers including +the system driver. However, modular drivers cannot take over the console +that is currently occupied by another modular driver. (Exception: Drivers that +call do_take_over_console() will succeed in the takeover regardless of the type +of driver occupying the consoles.) They can only take over the console that is +occupied by the system driver. In the same token, if the modular driver is +released by the console, the system driver will take over. + +Modular drivers, from the programmer's point of view, have to call:: + + do_take_over_console() - load and bind driver to console layer + give_up_console() - unload driver; it will only work if driver + is fully unbound + +In newer kernels, the following are also available:: + + do_register_con_driver() + do_unregister_con_driver() + +If sysfs is enabled, the contents of /sys/class/vtconsole can be +examined. This shows the console backends currently registered by the +system which are named vtcon where is an integer from 0 to 15. +Thus:: + + ls /sys/class/vtconsole + . .. vtcon0 vtcon1 + +Each directory in /sys/class/vtconsole has 3 files:: + + ls /sys/class/vtconsole/vtcon0 + . .. bind name uevent + +What do these files signify? + + 1. bind - this is a read/write file. It shows the status of the driver if + read, or acts to bind or unbind the driver to the virtual consoles + when written to. The possible values are: + + 0 + - means the driver is not bound and if echo'ed, commands the driver + to unbind + + 1 + - means the driver is bound and if echo'ed, commands the driver to + bind + + 2. name - read-only file. Shows the name of the driver in this format:: + + cat /sys/class/vtconsole/vtcon0/name + (S) VGA+ + + '(S)' stands for a (S)ystem driver, i.e., it cannot be directly + commanded to bind or unbind + + 'VGA+' is the name of the driver + + cat /sys/class/vtconsole/vtcon1/name + (M) frame buffer device + + In this case, '(M)' stands for a (M)odular driver, one that can be + directly commanded to bind or unbind. + + 3. uevent - ignore this file + +When unbinding, the modular driver is detached first, and then the system +driver takes over the consoles vacated by the driver. Binding, on the other +hand, will bind the driver to the consoles that are currently occupied by a +system driver. + +NOTE1: + Binding and unbinding must be selected in Kconfig. It's under:: + + Device Drivers -> + Character devices -> + Support for binding and unbinding console drivers + +NOTE2: + If any of the virtual consoles are in KD_GRAPHICS mode, then binding or + unbinding will not succeed. An example of an application that sets the + console to KD_GRAPHICS is X. + +How useful is this feature? This is very useful for console driver +developers. By unbinding the driver from the console layer, one can unload the +driver, make changes, recompile, reload and rebind the driver without any need +for rebooting the kernel. For regular users who may want to switch from +framebuffer console to VGA console and vice versa, this feature also makes +this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb +for more details.) + +Notes for developers +==================== + +do_take_over_console() is now broken up into:: + + do_register_con_driver() + do_bind_con_driver() - private function + +give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must +be fully unbound for this call to succeed. con_is_bound() will check if the +driver is bound or not. + +Guidelines for console driver writers +===================================== + +In order for binding to and unbinding from the console to properly work, +console drivers must follow these guidelines: + +1. All drivers, except system drivers, must call either do_register_con_driver() + or do_take_over_console(). do_register_con_driver() will just add the driver + to the console's internal list. It won't take over the + console. do_take_over_console(), as it name implies, will also take over (or + bind to) the console. + +2. All resources allocated during con->con_init() must be released in + con->con_deinit(). + +3. All resources allocated in con->con_startup() must be released when the + driver, which was previously bound, becomes unbound. The console layer + does not have a complementary call to con->con_startup() so it's up to the + driver to check when it's legal to release these resources. Calling + con_is_bound() in con->con_deinit() will help. If the call returned + false(), then it's safe to release the resources. This balance has to be + ensured because con->con_startup() can be called again when a request to + rebind the driver to the console arrives. + +4. Upon exit of the driver, ensure that the driver is totally unbound. If the + condition is satisfied, then the driver must call do_unregister_con_driver() + or give_up_console(). + +5. do_unregister_con_driver() can also be called on conditions which make it + impossible for the driver to service console requests. This can happen + with the framebuffer console that suddenly lost all of its drivers. + +The current crop of console drivers should still work correctly, but binding +and unbinding them may cause problems. With minimal fixes, these drivers can +be made to work correctly. + +Antonino Daplas diff --git a/Documentation/driver-api/dcdbas.rst b/Documentation/driver-api/dcdbas.rst new file mode 100644 index 000000000000..309cc57a7c1c --- /dev/null +++ b/Documentation/driver-api/dcdbas.rst @@ -0,0 +1,99 @@ +=================================== +Dell Systems Management Base Driver +=================================== + +Overview +======== + +The Dell Systems Management Base Driver provides a sysfs interface for +systems management software such as Dell OpenManage to perform system +management interrupts and host control actions (system power cycle or +power off after OS shutdown) on certain Dell systems. + +Dell OpenManage requires this driver on the following Dell PowerEdge systems: +300, 1300, 1400, 400SC, 500SC, 1500SC, 1550, 600SC, 1600SC, 650, 1655MC, +700, and 750. Other Dell software such as the open source libsmbios project +is expected to make use of this driver, and it may include the use of this +driver on other Dell systems. + +The Dell libsmbios project aims towards providing access to as much BIOS +information as possible. See http://linux.dell.com/libsmbios/main/ for +more information about the libsmbios project. + + +System Management Interrupt +=========================== + +On some Dell systems, systems management software must access certain +management information via a system management interrupt (SMI). The SMI data +buffer must reside in 32-bit address space, and the physical address of the +buffer is required for the SMI. The driver maintains the memory required for +the SMI and provides a way for the application to generate the SMI. +The driver creates the following sysfs entries for systems management +software to perform these system management interrupts:: + + /sys/devices/platform/dcdbas/smi_data + /sys/devices/platform/dcdbas/smi_data_buf_phys_addr + /sys/devices/platform/dcdbas/smi_data_buf_size + /sys/devices/platform/dcdbas/smi_request + +Systems management software must perform the following steps to execute +a SMI using this driver: + +1) Lock smi_data. +2) Write system management command to smi_data. +3) Write "1" to smi_request to generate a calling interface SMI or + "2" to generate a raw SMI. +4) Read system management command response from smi_data. +5) Unlock smi_data. + + +Host Control Action +=================== + +Dell OpenManage supports a host control feature that allows the administrator +to perform a power cycle or power off of the system after the OS has finished +shutting down. On some Dell systems, this host control feature requires that +a driver perform a SMI after the OS has finished shutting down. + +The driver creates the following sysfs entries for systems management software +to schedule the driver to perform a power cycle or power off host control +action after the system has finished shutting down: + +/sys/devices/platform/dcdbas/host_control_action +/sys/devices/platform/dcdbas/host_control_smi_type +/sys/devices/platform/dcdbas/host_control_on_shutdown + +Dell OpenManage performs the following steps to execute a power cycle or +power off host control action using this driver: + +1) Write host control action to be performed to host_control_action. +2) Write type of SMI that driver needs to perform to host_control_smi_type. +3) Write "1" to host_control_on_shutdown to enable host control action. +4) Initiate OS shutdown. + (Driver will perform host control SMI when it is notified that the OS + has finished shutting down.) + + +Host Control SMI Type +===================== + +The following table shows the value to write to host_control_smi_type to +perform a power cycle or power off host control action: + +=================== ===================== +PowerEdge System Host Control SMI Type +=================== ===================== + 300 HC_SMITYPE_TYPE1 + 1300 HC_SMITYPE_TYPE1 + 1400 HC_SMITYPE_TYPE2 + 500SC HC_SMITYPE_TYPE2 + 1500SC HC_SMITYPE_TYPE2 + 1550 HC_SMITYPE_TYPE2 + 600SC HC_SMITYPE_TYPE2 + 1600SC HC_SMITYPE_TYPE2 + 650 HC_SMITYPE_TYPE2 + 1655MC HC_SMITYPE_TYPE2 + 700 HC_SMITYPE_TYPE3 + 750 HC_SMITYPE_TYPE3 +=================== ===================== diff --git a/Documentation/driver-api/dell_rbu.rst b/Documentation/driver-api/dell_rbu.rst new file mode 100644 index 000000000000..5d1ce7bcd04d --- /dev/null +++ b/Documentation/driver-api/dell_rbu.rst @@ -0,0 +1,128 @@ +============================================================= +Usage of the new open sourced rbu (Remote BIOS Update) driver +============================================================= + +Purpose +======= + +Document demonstrating the use of the Dell Remote BIOS Update driver. +for updating BIOS images on Dell servers and desktops. + +Scope +===== + +This document discusses the functionality of the rbu driver only. +It does not cover the support needed from applications to enable the BIOS to +update itself with the image downloaded in to the memory. + +Overview +======== + +This driver works with Dell OpenManage or Dell Update Packages for updating +the BIOS on Dell servers (starting from servers sold since 1999), desktops +and notebooks (starting from those sold in 2005). + +Please go to http://support.dell.com register and you can find info on +OpenManage and Dell Update packages (DUP). + +Libsmbios can also be used to update BIOS on Dell systems go to +http://linux.dell.com/libsmbios/ for details. + +Dell_RBU driver supports BIOS update using the monolithic image and packetized +image methods. In case of monolithic the driver allocates a contiguous chunk +of physical pages having the BIOS image. In case of packetized the app +using the driver breaks the image in to packets of fixed sizes and the driver +would place each packet in contiguous physical memory. The driver also +maintains a link list of packets for reading them back. + +If the dell_rbu driver is unloaded all the allocated memory is freed. + +The rbu driver needs to have an application (as mentioned above)which will +inform the BIOS to enable the update in the next system reboot. + +The user should not unload the rbu driver after downloading the BIOS image +or updating. + +The driver load creates the following directories under the /sys file system:: + + /sys/class/firmware/dell_rbu/loading + /sys/class/firmware/dell_rbu/data + /sys/devices/platform/dell_rbu/image_type + /sys/devices/platform/dell_rbu/data + /sys/devices/platform/dell_rbu/packet_size + +The driver supports two types of update mechanism; monolithic and packetized. +These update mechanism depends upon the BIOS currently running on the system. +Most of the Dell systems support a monolithic update where the BIOS image is +copied to a single contiguous block of physical memory. + +In case of packet mechanism the single memory can be broken in smaller chunks +of contiguous memory and the BIOS image is scattered in these packets. + +By default the driver uses monolithic memory for the update type. This can be +changed to packets during the driver load time by specifying the load +parameter image_type=packet. This can also be changed later as below:: + + echo packet > /sys/devices/platform/dell_rbu/image_type + +In packet update mode the packet size has to be given before any packets can +be downloaded. It is done as below:: + + echo XXXX > /sys/devices/platform/dell_rbu/packet_size + +In the packet update mechanism, the user needs to create a new file having +packets of data arranged back to back. It can be done as follows +The user creates packets header, gets the chunk of the BIOS image and +places it next to the packetheader; now, the packetheader + BIOS image chunk +added together should match the specified packet_size. This makes one +packet, the user needs to create more such packets out of the entire BIOS +image file and then arrange all these packets back to back in to one single +file. + +This file is then copied to /sys/class/firmware/dell_rbu/data. +Once this file gets to the driver, the driver extracts packet_size data from +the file and spreads it across the physical memory in contiguous packet_sized +space. + +This method makes sure that all the packets get to the driver in a single operation. + +In monolithic update the user simply get the BIOS image (.hdr file) and copies +to the data file as is without any change to the BIOS image itself. + +Do the steps below to download the BIOS image. + +1) echo 1 > /sys/class/firmware/dell_rbu/loading +2) cp bios_image.hdr /sys/class/firmware/dell_rbu/data +3) echo 0 > /sys/class/firmware/dell_rbu/loading + +The /sys/class/firmware/dell_rbu/ entries will remain till the following is +done. + +:: + + echo -1 > /sys/class/firmware/dell_rbu/loading + +Until this step is completed the driver cannot be unloaded. + +Also echoing either mono, packet or init in to image_type will free up the +memory allocated by the driver. + +If a user by accident executes steps 1 and 3 above without executing step 2; +it will make the /sys/class/firmware/dell_rbu/ entries disappear. + +The entries can be recreated by doing the following:: + + echo init > /sys/devices/platform/dell_rbu/image_type + +.. note:: echoing init in image_type does not change it original value. + +Also the driver provides /sys/devices/platform/dell_rbu/data readonly file to +read back the image downloaded. + +.. note:: + + After updating the BIOS image a user mode application needs to execute + code which sends the BIOS update request to the BIOS. So on the next reboot + the BIOS knows about the new image downloaded and it updates itself. + Also don't unload the rbu driver if the image has to be updated. + diff --git a/Documentation/driver-api/edid.rst b/Documentation/driver-api/edid.rst new file mode 100644 index 000000000000..b1b5acd501ed --- /dev/null +++ b/Documentation/driver-api/edid.rst @@ -0,0 +1,58 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +EDID +==== + +In the good old days when graphics parameters were configured explicitly +in a file called xorg.conf, even broken hardware could be managed. + +Today, with the advent of Kernel Mode Setting, a graphics board is +either correctly working because all components follow the standards - +or the computer is unusable, because the screen remains dark after +booting or it displays the wrong area. Cases when this happens are: +- The graphics board does not recognize the monitor. +- The graphics board is unable to detect any EDID data. +- The graphics board incorrectly forwards EDID data to the driver. +- The monitor sends no or bogus EDID data. +- A KVM sends its own EDID data instead of querying the connected monitor. +Adding the kernel parameter "nomodeset" helps in most cases, but causes +restrictions later on. + +As a remedy for such situations, the kernel configuration item +CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an +individually prepared or corrected EDID data set in the /lib/firmware +directory from where it is loaded via the firmware interface. The code +(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for +commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200, +1680x1050, 1920x1080) as binary blobs, but the kernel source tree does +not contain code to create these data. In order to elucidate the origin +of the built-in binary EDID blobs and to facilitate the creation of +individual data for a specific misbehaving monitor, commented sources +and a Makefile environment are given here. + +To create binary EDID and C source code files from the existing data +material, simply type "make". + +If you want to create your own EDID file, copy the file 1024x768.S, +replace the settings with your own data and add a new target to the +Makefile. Please note that the EDID data structure expects the timing +values in a different way as compared to the standard X11 format. + +X11: + HTimings: + hdisp hsyncstart hsyncend htotal + VTimings: + vdisp vsyncstart vsyncend vtotal + +EDID:: + + #define XPIX hdisp + #define XBLANK htotal-hdisp + #define XOFFSET hsyncstart-hdisp + #define XPULSE hsyncend-hsyncstart + + #define YPIX vdisp + #define YBLANK vtotal-vdisp + #define YOFFSET vsyncstart-vdisp + #define YPULSE vsyncend-vsyncstart diff --git a/Documentation/driver-api/eisa.rst b/Documentation/driver-api/eisa.rst new file mode 100644 index 000000000000..c07565ba57da --- /dev/null +++ b/Documentation/driver-api/eisa.rst @@ -0,0 +1,230 @@ +================ +EISA bus support +================ + +:Author: Marc Zyngier + +This document groups random notes about porting EISA drivers to the +new EISA/sysfs API. + +Starting from version 2.5.59, the EISA bus is almost given the same +status as other much more mainstream busses such as PCI or USB. This +has been possible through sysfs, which defines a nice enough set of +abstractions to manage busses, devices and drivers. + +Although the new API is quite simple to use, converting existing +drivers to the new infrastructure is not an easy task (mostly because +detection code is generally also used to probe ISA cards). Moreover, +most EISA drivers are among the oldest Linux drivers so, as you can +imagine, some dust has settled here over the years. + +The EISA infrastructure is made up of three parts: + + - The bus code implements most of the generic code. It is shared + among all the architectures that the EISA code runs on. It + implements bus probing (detecting EISA cards available on the bus), + allocates I/O resources, allows fancy naming through sysfs, and + offers interfaces for driver to register. + + - The bus root driver implements the glue between the bus hardware + and the generic bus code. It is responsible for discovering the + device implementing the bus, and setting it up to be latter probed + by the bus code. This can go from something as simple as reserving + an I/O region on x86, to the rather more complex, like the hppa + EISA code. This is the part to implement in order to have EISA + running on an "new" platform. + + - The driver offers the bus a list of devices that it manages, and + implements the necessary callbacks to probe and release devices + whenever told to. + +Every function/structure below lives in , which depends +heavily on . + +Bus root driver +=============== + +:: + + int eisa_root_register (struct eisa_root_device *root); + +The eisa_root_register function is used to declare a device as the +root of an EISA bus. The eisa_root_device structure holds a reference +to this device, as well as some parameters for probing purposes:: + + struct eisa_root_device { + struct device *dev; /* Pointer to bridge device */ + struct resource *res; + unsigned long bus_base_addr; + int slots; /* Max slot number */ + int force_probe; /* Probe even when no slot 0 */ + u64 dma_mask; /* from bridge device */ + int bus_nr; /* Set by eisa_root_register */ + struct resource eisa_root_res; /* ditto */ + }; + +============= ====================================================== +node used for eisa_root_register internal purpose +dev pointer to the root device +res root device I/O resource +bus_base_addr slot 0 address on this bus +slots max slot number to probe +force_probe Probe even when slot 0 is empty (no EISA mainboard) +dma_mask Default DMA mask. Usually the bridge device dma_mask. +bus_nr unique bus id, set by eisa_root_register +============= ====================================================== + +Driver +====== + +:: + + int eisa_driver_register (struct eisa_driver *edrv); + void eisa_driver_unregister (struct eisa_driver *edrv); + +Clear enough ? + +:: + + struct eisa_device_id { + char sig[EISA_SIG_LEN]; + unsigned long driver_data; + }; + + struct eisa_driver { + const struct eisa_device_id *id_table; + struct device_driver driver; + }; + +=============== ==================================================== +id_table an array of NULL terminated EISA id strings, + followed by an empty string. Each string can + optionally be paired with a driver-dependent value + (driver_data). + +driver a generic driver, such as described in + Documentation/driver-api/driver-model/driver.rst. Only .name, + .probe and .remove members are mandatory. +=============== ==================================================== + +An example is the 3c59x driver:: + + static struct eisa_device_id vortex_eisa_ids[] = { + { "TCM5920", EISA_3C592_OFFSET }, + { "TCM5970", EISA_3C597_OFFSET }, + { "" } + }; + + static struct eisa_driver vortex_eisa_driver = { + .id_table = vortex_eisa_ids, + .driver = { + .name = "3c59x", + .probe = vortex_eisa_probe, + .remove = vortex_eisa_remove + } + }; + +Device +====== + +The sysfs framework calls .probe and .remove functions upon device +discovery and removal (note that the .remove function is only called +when driver is built as a module). + +Both functions are passed a pointer to a 'struct device', which is +encapsulated in a 'struct eisa_device' described as follows:: + + struct eisa_device { + struct eisa_device_id id; + int slot; + int state; + unsigned long base_addr; + struct resource res[EISA_MAX_RESOURCES]; + u64 dma_mask; + struct device dev; /* generic device */ + }; + +======== ============================================================ +id EISA id, as read from device. id.driver_data is set from the + matching driver EISA id. +slot slot number which the device was detected on +state set of flags indicating the state of the device. Current + flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED. +res set of four 256 bytes I/O regions allocated to this device +dma_mask DMA mask set from the parent device. +dev generic device (see Documentation/driver-api/driver-model/device.rst) +======== ============================================================ + +You can get the 'struct eisa_device' from 'struct device' using the +'to_eisa_device' macro. + +Misc stuff +========== + +:: + + void eisa_set_drvdata (struct eisa_device *edev, void *data); + +Stores data into the device's driver_data area. + +:: + + void *eisa_get_drvdata (struct eisa_device *edev): + +Gets the pointer previously stored into the device's driver_data area. + +:: + + int eisa_get_region_index (void *addr); + +Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given +address. + +Kernel parameters +================= + +eisa_bus.enable_dev + A comma-separated list of slots to be enabled, even if the firmware + set the card as disabled. The driver must be able to properly + initialize the device in such conditions. + +eisa_bus.disable_dev + A comma-separated list of slots to be enabled, even if the firmware + set the card as enabled. The driver won't be called to handle this + device. + +virtual_root.force_probe + Force the probing code to probe EISA slots even when it cannot find an + EISA compliant mainboard (nothing appears on slot 0). Defaults to 0 + (don't force), and set to 1 (force probing) when either + CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. + +Random notes +============ + +Converting an EISA driver to the new API mostly involves *deleting* +code (since probing is now in the core EISA code). Unfortunately, most +drivers share their probing routine between ISA, and EISA. Special +care must be taken when ripping out the EISA code, so other busses +won't suffer from these surgical strikes... + +You *must not* expect any EISA device to be detected when returning +from eisa_driver_register, since the chances are that the bus has not +yet been probed. In fact, that's what happens most of the time (the +bus root driver usually kicks in rather late in the boot process). +Unfortunately, most drivers are doing the probing by themselves, and +expect to have explored the whole machine when they exit their probe +routine. + +For example, switching your favorite EISA SCSI card to the "hotplug" +model is "the right thing"(tm). + +Thanks +====== + +I'd like to thank the following people for their help: + +- Xavier Benigni for lending me a wonderful Alpha Jensen, +- James Bottomley, Jeff Garzik for getting this stuff into the kernel, +- Andries Brouwer for contributing numerous EISA ids, +- Catrin Jones for coping with far too many machines at home. diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 9fb03b7bdeb1..d1c6513dd20d 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -68,7 +68,33 @@ available subsections can be seen below. fpga/index acpi/index backlight/lp855x-driver.rst + bt8xxgpio + connector + console + dcdbas + dell_rbu + edid + eisa + isa + isapnp generic-counter + lightnvm-pblk + men-chameleon-bus + ntb + nvmem + parport-lowlevel + pti_intel_mid + pwm + rfkill + sgi-ioc4 + sm501 + smsc_ece1099 + switchtec + sync_file + vfio-mediated-device + vfio + xillybus + zorro .. only:: subproject and html diff --git a/Documentation/driver-api/isa.rst b/Documentation/driver-api/isa.rst new file mode 100644 index 000000000000..def4a7b690b5 --- /dev/null +++ b/Documentation/driver-api/isa.rst @@ -0,0 +1,122 @@ +=========== +ISA Drivers +=========== + +The following text is adapted from the commit message of the initial +commit of the ISA bus driver authored by Rene Herman. + +During the recent "isa drivers using platform devices" discussion it was +pointed out that (ALSA) ISA drivers ran into the problem of not having +the option to fail driver load (device registration rather) upon not +finding their hardware due to a probe() error not being passed up +through the driver model. In the course of that, I suggested a separate +ISA bus might be best; Russell King agreed and suggested this bus could +use the .match() method for the actual device discovery. + +The attached does this. For this old non (generically) discoverable ISA +hardware only the driver itself can do discovery so as a difference with +the platform_bus, this isa_bus also distributes match() up to the +driver. + +As another difference: these devices only exist in the driver model due +to the driver creating them because it might want to drive them, meaning +that all device creation has been made internal as well. + +The usage model this provides is nice, and has been acked from the ALSA +side by Takashi Iwai and Jaroslav Kysela. The ALSA driver module_init's +now (for oldisa-only drivers) become:: + + static int __init alsa_card_foo_init(void) + { + return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS); + } + + static void __exit alsa_card_foo_exit(void) + { + isa_unregister_driver(&snd_foo_isa_driver); + } + +Quite like the other bus models therefore. This removes a lot of +duplicated init code from the ALSA ISA drivers. + +The passed in isa_driver struct is the regular driver struct embedding a +struct device_driver, the normal probe/remove/shutdown/suspend/resume +callbacks, and as indicated that .match callback. + +The "SNDRV_CARDS" you see being passed in is a "unsigned int ndev" +parameter, indicating how many devices to create and call our methods +with. + +The platform_driver callbacks are called with a platform_device param; +the isa_driver callbacks are being called with a ``struct device *dev, +unsigned int id`` pair directly -- with the device creation completely +internal to the bus it's much cleaner to not leak isa_dev's by passing +them in at all. The id is the only thing we ever want other then the +struct device anyways, and it makes for nicer code in the callbacks as +well. + +With this additional .match() callback ISA drivers have all options. If +ALSA would want to keep the old non-load behaviour, it could stick all +of the old .probe in .match, which would only keep them registered after +everything was found to be present and accounted for. If it wanted the +behaviour of always loading as it inadvertently did for a bit after the +changeover to platform devices, it could just not provide a .match() and +do everything in .probe() as before. + +If it, as Takashi Iwai already suggested earlier as a way of following +the model from saner buses more closely, wants to load when a later bind +could conceivably succeed, it could use .match() for the prerequisites +(such as checking the user wants the card enabled and that port/irq/dma +values have been passed in) and .probe() for everything else. This is +the nicest model. + +To the code... + +This exports only two functions; isa_{,un}register_driver(). + +isa_register_driver() register's the struct device_driver, and then +loops over the passed in ndev creating devices and registering them. +This causes the bus match method to be called for them, which is:: + + int isa_bus_match(struct device *dev, struct device_driver *driver) + { + struct isa_driver *isa_driver = to_isa_driver(driver); + + if (dev->platform_data == isa_driver) { + if (!isa_driver->match || + isa_driver->match(dev, to_isa_dev(dev)->id)) + return 1; + dev->platform_data = NULL; + } + return 0; + } + +The first thing this does is check if this device is in fact one of this +driver's devices by seeing if the device's platform_data pointer is set +to this driver. Platform devices compare strings, but we don't need to +do that with everything being internal, so isa_register_driver() abuses +dev->platform_data as a isa_driver pointer which we can then check here. +I believe platform_data is available for this, but if rather not, moving +the isa_driver pointer to the private struct isa_dev is ofcourse fine as +well. + +Then, if the the driver did not provide a .match, it matches. If it did, +the driver match() method is called to determine a match. + +If it did **not** match, dev->platform_data is reset to indicate this to +isa_register_driver which can then unregister the device again. + +If during all this, there's any error, or no devices matched at all +everything is backed out again and the error, or -ENODEV, is returned. + +isa_unregister_driver() just unregisters the matched devices and the +driver itself. + +module_isa_driver is a helper macro for ISA drivers which do not do +anything special in module init/exit. This eliminates a lot of +boilerplate code. Each module may only use this macro once, and calling +it replaces module_init and module_exit. + +max_num_isa_dev is a macro to determine the maximum possible number of +ISA devices which may be registered in the I/O port address space given +the address extent of the ISA devices. diff --git a/Documentation/driver-api/isapnp.rst b/Documentation/driver-api/isapnp.rst new file mode 100644 index 000000000000..8d0840ac847b --- /dev/null +++ b/Documentation/driver-api/isapnp.rst @@ -0,0 +1,15 @@ +========================================================== +ISA Plug & Play support by Jaroslav Kysela +========================================================== + +Interface /proc/isapnp +====================== + +The interface has been removed. See pnp.txt for more details. + +Interface /proc/bus/isapnp +========================== + +This directory allows access to ISA PnP cards and logical devices. +The regular files contain the contents of ISA PnP registers for +a logical device. diff --git a/Documentation/driver-api/lightnvm-pblk.rst b/Documentation/driver-api/lightnvm-pblk.rst new file mode 100644 index 000000000000..1040ed1cec81 --- /dev/null +++ b/Documentation/driver-api/lightnvm-pblk.rst @@ -0,0 +1,21 @@ +pblk: Physical Block Device Target +================================== + +pblk implements a fully associative, host-based FTL that exposes a traditional +block I/O interface. Its primary responsibilities are: + + - Map logical addresses onto physical addresses (4KB granularity) in a + logical-to-physical (L2P) table. + - Maintain the integrity and consistency of the L2P table as well as its + recovery from normal tear down and power outage. + - Deal with controller- and media-specific constrains. + - Handle I/O errors. + - Implement garbage collection. + - Maintain consistency across the I/O stack during synchronization points. + +For more information please refer to: + + http://lightnvm.io + +which maintains updated FAQs, manual pages, technical documentation, tools, +contacts, etc. diff --git a/Documentation/driver-api/men-chameleon-bus.rst b/Documentation/driver-api/men-chameleon-bus.rst new file mode 100644 index 000000000000..1b1f048aa748 --- /dev/null +++ b/Documentation/driver-api/men-chameleon-bus.rst @@ -0,0 +1,175 @@ +================= +MEN Chameleon Bus +================= + +.. Table of Contents + ================= + 1 Introduction + 1.1 Scope of this Document + 1.2 Limitations of the current implementation + 2 Architecture + 2.1 MEN Chameleon Bus + 2.2 Carrier Devices + 2.3 Parser + 3 Resource handling + 3.1 Memory Resources + 3.2 IRQs + 4 Writing an MCB driver + 4.1 The driver structure + 4.2 Probing and attaching + 4.3 Initializing the driver + + +Introduction +============ + +This document describes the architecture and implementation of the MEN +Chameleon Bus (called MCB throughout this document). + +Scope of this Document +---------------------- + +This document is intended to be a short overview of the current +implementation and does by no means describe the complete possibilities of MCB +based devices. + +Limitations of the current implementation +----------------------------------------- + +The current implementation is limited to PCI and PCIe based carrier devices +that only use a single memory resource and share the PCI legacy IRQ. Not +implemented are: + +- Multi-resource MCB devices like the VME Controller or M-Module carrier. +- MCB devices that need another MCB device, like SRAM for a DMA Controller's + buffer descriptors or a video controller's video memory. +- A per-carrier IRQ domain for carrier devices that have one (or more) IRQs + per MCB device like PCIe based carriers with MSI or MSI-X support. + +Architecture +============ + +MCB is divided into 3 functional blocks: + +- The MEN Chameleon Bus itself, +- drivers for MCB Carrier Devices and +- the parser for the Chameleon table. + +MEN Chameleon Bus +----------------- + +The MEN Chameleon Bus is an artificial bus system that attaches to a so +called Chameleon FPGA device found on some hardware produced my MEN Mikro +Elektronik GmbH. These devices are multi-function devices implemented in a +single FPGA and usually attached via some sort of PCI or PCIe link. Each +FPGA contains a header section describing the content of the FPGA. The +header lists the device id, PCI BAR, offset from the beginning of the PCI +BAR, size in the FPGA, interrupt number and some other properties currently +not handled by the MCB implementation. + +Carrier Devices +--------------- + +A carrier device is just an abstraction for the real world physical bus the +Chameleon FPGA is attached to. Some IP Core drivers may need to interact with +properties of the carrier device (like querying the IRQ number of a PCI +device). To provide abstraction from the real hardware bus, an MCB carrier +device provides callback methods to translate the driver's MCB function calls +to hardware related function calls. For example a carrier device may +implement the get_irq() method which can be translated into a hardware bus +query for the IRQ number the device should use. + +Parser +------ + +The parser reads the first 512 bytes of a Chameleon device and parses the +Chameleon table. Currently the parser only supports the Chameleon v2 variant +of the Chameleon table but can easily be adopted to support an older or +possible future variant. While parsing the table's entries new MCB devices +are allocated and their resources are assigned according to the resource +assignment in the Chameleon table. After resource assignment is finished, the +MCB devices are registered at the MCB and thus at the driver core of the +Linux kernel. + +Resource handling +================= + +The current implementation assigns exactly one memory and one IRQ resource +per MCB device. But this is likely going to change in the future. + +Memory Resources +---------------- + +Each MCB device has exactly one memory resource, which can be requested from +the MCB bus. This memory resource is the physical address of the MCB device +inside the carrier and is intended to be passed to ioremap() and friends. It +is already requested from the kernel by calling request_mem_region(). + +IRQs +---- + +Each MCB device has exactly one IRQ resource, which can be requested from the +MCB bus. If a carrier device driver implements the ->get_irq() callback +method, the IRQ number assigned by the carrier device will be returned, +otherwise the IRQ number inside the Chameleon table will be returned. This +number is suitable to be passed to request_irq(). + +Writing an MCB driver +===================== + +The driver structure +-------------------- + +Each MCB driver has a structure to identify the device driver as well as +device ids which identify the IP Core inside the FPGA. The driver structure +also contains callback methods which get executed on driver probe and +removal from the system:: + + static const struct mcb_device_id foo_ids[] = { + { .device = 0x123 }, + { } + }; + MODULE_DEVICE_TABLE(mcb, foo_ids); + + static struct mcb_driver foo_driver = { + driver = { + .name = "foo-bar", + .owner = THIS_MODULE, + }, + .probe = foo_probe, + .remove = foo_remove, + .id_table = foo_ids, + }; + +Probing and attaching +--------------------- + +When a driver is loaded and the MCB devices it services are found, the MCB +core will call the driver's probe callback method. When the driver is removed +from the system, the MCB core will call the driver's remove callback method:: + + static init foo_probe(struct mcb_device *mdev, const struct mcb_device_id *id); + static void foo_remove(struct mcb_device *mdev); + +Initializing the driver +----------------------- + +When the kernel is booted or your foo driver module is inserted, you have to +perform driver initialization. Usually it is enough to register your driver +module at the MCB core:: + + static int __init foo_init(void) + { + return mcb_register_driver(&foo_driver); + } + module_init(foo_init); + + static void __exit foo_exit(void) + { + mcb_unregister_driver(&foo_driver); + } + module_exit(foo_exit); + +The module_mcb_driver() macro can be used to reduce the above code:: + + module_mcb_driver(foo_driver); diff --git a/Documentation/driver-api/ntb.rst b/Documentation/driver-api/ntb.rst new file mode 100644 index 000000000000..074a423c853c --- /dev/null +++ b/Documentation/driver-api/ntb.rst @@ -0,0 +1,236 @@ +=========== +NTB Drivers +=========== + +NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects +the separate memory systems of two or more computers to the same PCI-Express +fabric. Existing NTB hardware supports a common feature set: doorbell +registers and memory translation windows, as well as non common features like +scratchpad and message registers. Scratchpad registers are read-and-writable +registers that are accessible from either side of the device, so that peers can +exchange a small amount of information at a fixed address. Message registers can +be utilized for the same purpose. Additionally they are provided with with +special status bits to make sure the information isn't rewritten by another +peer. Doorbell registers provide a way for peers to send interrupt events. +Memory windows allow translated read and write access to the peer memory. + +NTB Core Driver (ntb) +===================== + +The NTB core driver defines an api wrapping the common feature set, and allows +clients interested in NTB features to discover NTB the devices supported by +hardware drivers. The term "client" is used here to mean an upper layer +component making use of the NTB api. The term "driver," or "hardware driver," +is used here to mean a driver for a specific vendor and model of NTB hardware. + +NTB Client Drivers +================== + +NTB client drivers should register with the NTB core driver. After +registering, the client probe and remove functions will be called appropriately +as ntb hardware, or hardware drivers, are inserted and removed. The +registration uses the Linux Device framework, so it should feel familiar to +anyone who has written a pci driver. + +NTB Typical client driver implementation +---------------------------------------- + +Primary purpose of NTB is to share some peace of memory between at least two +systems. So the NTB device features like Scratchpad/Message registers are +mainly used to perform the proper memory window initialization. Typically +there are two types of memory window interfaces supported by the NTB API: +inbound translation configured on the local ntb port and outbound translation +configured by the peer, on the peer ntb port. The first type is +depicted on the next figure:: + + Inbound translation: + + Memory: Local NTB Port: Peer NTB Port: Peer MMIO: + ____________ + | dma-mapped |-ntb_mw_set_trans(addr) | + | memory | _v____________ | ______________ + | (addr) |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO + |------------| |--------------| | |--------------| + +So typical scenario of the first type memory window initialization looks: +1) allocate a memory region, 2) put translated address to NTB config, +3) somehow notify a peer device of performed initialization, 4) peer device +maps corresponding outbound memory window so to have access to the shared +memory region. + +The second type of interface, that implies the shared windows being +initialized by a peer device, is depicted on the figure:: + + Outbound translation: + + Memory: Local NTB Port: Peer NTB Port: Peer MMIO: + ____________ ______________ + | dma-mapped | | | MW base addr |<== memory-mapped IO + | memory | | |--------------| + | (addr) |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr) + |------------| | |--------------| + +Typical scenario of the second type interface initialization would be: +1) allocate a memory region, 2) somehow deliver a translated address to a peer +device, 3) peer puts the translated address to NTB config, 4) peer device maps +outbound memory window so to have access to the shared memory region. + +As one can see the described scenarios can be combined in one portable +algorithm. + + Local device: + 1) Allocate memory for a shared window + 2) Initialize memory window by translated address of the allocated region + (it may fail if local memory window initialization is unsupported) + 3) Send the translated address and memory window index to a peer device + + Peer device: + 1) Initialize memory window with retrieved address of the allocated + by another device memory region (it may fail if peer memory window + initialization is unsupported) + 2) Map outbound memory window + +In accordance with this scenario, the NTB Memory Window API can be used as +follows: + + Local device: + 1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can + be allocated for memory windows between local device and peer device + of port with specified index. + 2) ntb_get_align(pidx, midx) - retrieve parameters restricting the + shared memory region alignment and size. Then memory can be properly + allocated. + 3) Allocate physically contiguous memory region in compliance with + restrictions retrieved in 2). + 4) ntb_mw_set_trans(pidx, midx) - try to set translation address of + the memory window with specified index for the defined peer device + (it may fail if local translated address setting is not supported) + 5) Send translated base address (usually together with memory window + number) to the peer device using, for instance, scratchpad or message + registers. + + Peer device: + 1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other + device (related to pidx) translated address for specified memory + window. It may fail if retrieved address, for instance, exceeds + maximum possible address or isn't properly aligned. + 2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory + window so to have an access to the shared memory. + +Also it is worth to note, that method ntb_mw_count(pidx) should return the +same value as ntb_peer_mw_count() on the peer with port index - pidx. + +NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev) +------------------------------------------------------------------ + +The primary client for NTB is the Transport client, used in tandem with NTB +Netdev. These drivers function together to create a logical link to the peer, +across the ntb, to exchange packets of network data. The Transport client +establishes a logical link to the peer, and creates queue pairs to exchange +messages and data. The NTB Netdev then creates an ethernet device using a +Transport queue pair. Network data is copied between socket buffers and the +Transport queue pair buffer. The Transport client may be used for other things +besides Netdev, however no other applications have yet been written. + +NTB Ping Pong Test Client (ntb\_pingpong) +----------------------------------------- + +The Ping Pong test client serves as a demonstration to exercise the doorbell +and scratchpad registers of NTB hardware, and as an example simple NTB client. +Ping Pong enables the link when started, waits for the NTB link to come up, and +then proceeds to read and write the doorbell scratchpad registers of the NTB. +The peers interrupt each other using a bit mask of doorbell bits, which is +shifted by one in each round, to test the behavior of multiple doorbell bits +and interrupt vectors. The Ping Pong driver also reads the first local +scratchpad, and writes the value plus one to the first peer scratchpad, each +round before writing the peer doorbell register. + +Module Parameters: + +* unsafe - Some hardware has known issues with scratchpad and doorbell + registers. By default, Ping Pong will not attempt to exercise such + hardware. You may override this behavior at your own risk by setting + unsafe=1. +* delay\_ms - Specify the delay between receiving a doorbell + interrupt event and setting the peer doorbell register for the next + round. +* init\_db - Specify the doorbell bits to start new series of rounds. A new + series begins once all the doorbell bits have been shifted out of + range. +* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and + then to observe debugging output on the console. + +NTB Tool Test Client (ntb\_tool) +-------------------------------- + +The Tool test client serves for debugging, primarily, ntb hardware and drivers. +The Tool provides access through debugfs for reading, setting, and clearing the +NTB doorbell, and reading and writing scratchpads. + +The Tool does not currently have any module parameters. + +Debugfs Files: + +* *debugfs*/ntb\_tool/*hw*/ + A directory in debugfs will be created for each + NTB device probed by the tool. This directory is shortened to *hw* + below. +* *hw*/db + This file is used to read, set, and clear the local doorbell. Not + all operations may be supported by all hardware. To read the doorbell, + read the file. To set the doorbell, write `s` followed by the bits to + set (eg: `echo 's 0x0101' > db`). To clear the doorbell, write `c` + followed by the bits to clear. +* *hw*/mask + This file is used to read, set, and clear the local doorbell mask. + See *db* for details. +* *hw*/peer\_db + This file is used to read, set, and clear the peer doorbell. + See *db* for details. +* *hw*/peer\_mask + This file is used to read, set, and clear the peer doorbell + mask. See *db* for details. +* *hw*/spad + This file is used to read and write local scratchpads. To read + the values of all scratchpads, read the file. To write values, write a + series of pairs of scratchpad number and value + (eg: `echo '4 0x123 7 0xabc' > spad` + # to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively). +* *hw*/peer\_spad + This file is used to read and write peer scratchpads. See + *spad* for details. + +NTB Hardware Drivers +==================== + +NTB hardware drivers should register devices with the NTB core driver. After +registering, clients probe and remove functions will be called. + +NTB Intel Hardware Driver (ntb\_hw\_intel) +------------------------------------------ + +The Intel hardware driver supports NTB on Xeon and Atom CPUs. + +Module Parameters: + +* b2b\_mw\_idx + If the peer ntb is to be accessed via a memory window, then use + this memory window to access the peer ntb. A value of zero or positive + starts from the first mw idx, and a negative value starts from the last + mw idx. Both sides MUST set the same value here! The default value is + `-1`. +* b2b\_mw\_share + If the peer ntb is to be accessed via a memory window, and if + the memory window is large enough, still allow the client to use the + second half of the memory window for address translation to the peer. +* xeon\_b2b\_usd\_bar2\_addr64 + If using B2B topology on Xeon hardware, use + this 64 bit address on the bus between the NTB devices for the window + at BAR2, on the upstream side of the link. +* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*. +* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*. diff --git a/Documentation/driver-api/nvmem.rst b/Documentation/driver-api/nvmem.rst new file mode 100644 index 000000000000..d9d958d5c824 --- /dev/null +++ b/Documentation/driver-api/nvmem.rst @@ -0,0 +1,189 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +NVMEM Subsystem +=============== + + Srinivas Kandagatla + +This document explains the NVMEM Framework along with the APIs provided, +and how to use it. + +1. Introduction +=============== +*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to +retrieve configuration of SOC or Device specific data from non volatile +memories like eeprom, efuses and so on. + +Before this framework existed, NVMEM drivers like eeprom were stored in +drivers/misc, where they all had to duplicate pretty much the same code to +register a sysfs file, allow in-kernel users to access the content of the +devices they were driving, etc. + +This was also a problem as far as other in-kernel users were involved, since +the solutions used were pretty much different from one driver to another, there +was a rather big abstraction leak. + +This framework aims at solve these problems. It also introduces DT +representation for consumer devices to go get the data they require (MAC +Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This +framework is based on regmap, so that most of the abstraction available in +regmap can be reused, across multiple types of buses. + +NVMEM Providers ++++++++++++++++ + +NVMEM provider refers to an entity that implements methods to initialize, read +and write the non-volatile memory. + +2. Registering/Unregistering the NVMEM provider +=============================================== + +A NVMEM provider can register with NVMEM core by supplying relevant +nvmem configuration to nvmem_register(), on success core would return a valid +nvmem_device pointer. + +nvmem_unregister(nvmem) is used to unregister a previously registered provider. + +For example, a simple qfprom case:: + + static struct nvmem_config econfig = { + .name = "qfprom", + .owner = THIS_MODULE, + }; + + static int qfprom_probe(struct platform_device *pdev) + { + ... + econfig.dev = &pdev->dev; + nvmem = nvmem_register(&econfig); + ... + } + +It is mandatory that the NVMEM provider has a regmap associated with its +struct device. Failure to do would return error code from nvmem_register(). + +Users of board files can define and register nvmem cells using the +nvmem_cell_table struct:: + + static struct nvmem_cell_info foo_nvmem_cells[] = { + { + .name = "macaddr", + .offset = 0x7f00, + .bytes = ETH_ALEN, + } + }; + + static struct nvmem_cell_table foo_nvmem_cell_table = { + .nvmem_name = "i2c-eeprom", + .cells = foo_nvmem_cells, + .ncells = ARRAY_SIZE(foo_nvmem_cells), + }; + + nvmem_add_cell_table(&foo_nvmem_cell_table); + +Additionally it is possible to create nvmem cell lookup entries and register +them with the nvmem framework from machine code as shown in the example below:: + + static struct nvmem_cell_lookup foo_nvmem_lookup = { + .nvmem_name = "i2c-eeprom", + .cell_name = "macaddr", + .dev_id = "foo_mac.0", + .con_id = "mac-address", + }; + + nvmem_add_cell_lookups(&foo_nvmem_lookup, 1); + +NVMEM Consumers ++++++++++++++++ + +NVMEM consumers are the entities which make use of the NVMEM provider to +read from and to NVMEM. + +3. NVMEM cell based consumer APIs +================================= + +NVMEM cells are the data entries/fields in the NVMEM. +The NVMEM framework provides 3 APIs to read/write NVMEM cells:: + + struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name); + struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name); + + void nvmem_cell_put(struct nvmem_cell *cell); + void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell); + + void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len); + int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len); + +`*nvmem_cell_get()` apis will get a reference to nvmem cell for a given id, +and nvmem_cell_read/write() can then read or write to the cell. +Once the usage of the cell is finished the consumer should call +`*nvmem_cell_put()` to free all the allocation memory for the cell. + +4. Direct NVMEM device based consumer APIs +========================================== + +In some instances it is necessary to directly read/write the NVMEM. +To facilitate such consumers NVMEM framework provides below apis:: + + struct nvmem_device *nvmem_device_get(struct device *dev, const char *name); + struct nvmem_device *devm_nvmem_device_get(struct device *dev, + const char *name); + void nvmem_device_put(struct nvmem_device *nvmem); + int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset, + size_t bytes, void *buf); + int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset, + size_t bytes, void *buf); + int nvmem_device_cell_read(struct nvmem_device *nvmem, + struct nvmem_cell_info *info, void *buf); + int nvmem_device_cell_write(struct nvmem_device *nvmem, + struct nvmem_cell_info *info, void *buf); + +Before the consumers can read/write NVMEM directly, it should get hold +of nvmem_controller from one of the `*nvmem_device_get()` api. + +The difference between these apis and cell based apis is that these apis always +take nvmem_device as parameter. + +5. Releasing a reference to the NVMEM +===================================== + +When a consumer no longer needs the NVMEM, it has to release the reference +to the NVMEM it has obtained using the APIs mentioned in the above section. +The NVMEM framework provides 2 APIs to release a reference to the NVMEM:: + + void nvmem_cell_put(struct nvmem_cell *cell); + void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell); + void nvmem_device_put(struct nvmem_device *nvmem); + void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem); + +Both these APIs are used to release a reference to the NVMEM and +devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated +with this NVMEM. + +Userspace ++++++++++ + +6. Userspace binary interface +============================== + +Userspace can read/write the raw NVMEM file located at:: + + /sys/bus/nvmem/devices/*/nvmem + +ex:: + + hexdump /sys/bus/nvmem/devices/qfprom0/nvmem + + 0000000 0000 0000 0000 0000 0000 0000 0000 0000 + * + 00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00 + 0000000 0000 0000 0000 0000 0000 0000 0000 0000 + ... + * + 0001000 + +7. DeviceTree Binding +===================== + +See Documentation/devicetree/bindings/nvmem/nvmem.txt diff --git a/Documentation/driver-api/parport-lowlevel.rst b/Documentation/driver-api/parport-lowlevel.rst new file mode 100644 index 000000000000..0633d70ffda7 --- /dev/null +++ b/Documentation/driver-api/parport-lowlevel.rst @@ -0,0 +1,1832 @@ +=============================== +PARPORT interface documentation +=============================== + +:Time-stamp: <2000-02-24 13:30:20 twaugh> + +Described here are the following functions: + +Global functions:: + parport_register_driver + parport_unregister_driver + parport_enumerate + parport_register_device + parport_unregister_device + parport_claim + parport_claim_or_block + parport_release + parport_yield + parport_yield_blocking + parport_wait_peripheral + parport_poll_peripheral + parport_wait_event + parport_negotiate + parport_read + parport_write + parport_open + parport_close + parport_device_id + parport_device_coords + parport_find_class + parport_find_device + parport_set_timeout + +Port functions (can be overridden by low-level drivers): + + SPP:: + port->ops->read_data + port->ops->write_data + port->ops->read_status + port->ops->read_control + port->ops->write_control + port->ops->frob_control + port->ops->enable_irq + port->ops->disable_irq + port->ops->data_forward + port->ops->data_reverse + + EPP:: + port->ops->epp_write_data + port->ops->epp_read_data + port->ops->epp_write_addr + port->ops->epp_read_addr + + ECP:: + port->ops->ecp_write_data + port->ops->ecp_read_data + port->ops->ecp_write_addr + + Other:: + port->ops->nibble_read_data + port->ops->byte_read_data + port->ops->compat_write_data + +The parport subsystem comprises ``parport`` (the core port-sharing +code), and a variety of low-level drivers that actually do the port +accesses. Each low-level driver handles a particular style of port +(PC, Amiga, and so on). + +The parport interface to the device driver author can be broken down +into global functions and port functions. + +The global functions are mostly for communicating between the device +driver and the parport subsystem: acquiring a list of available ports, +claiming a port for exclusive use, and so on. They also include +``generic`` functions for doing standard things that will work on any +IEEE 1284-capable architecture. + +The port functions are provided by the low-level drivers, although the +core parport module provides generic ``defaults`` for some routines. +The port functions can be split into three groups: SPP, EPP, and ECP. + +SPP (Standard Parallel Port) functions modify so-called ``SPP`` +registers: data, status, and control. The hardware may not actually +have registers exactly like that, but the PC does and this interface is +modelled after common PC implementations. Other low-level drivers may +be able to emulate most of the functionality. + +EPP (Enhanced Parallel Port) functions are provided for reading and +writing in IEEE 1284 EPP mode, and ECP (Extended Capabilities Port) +functions are used for IEEE 1284 ECP mode. (What about BECP? Does +anyone care?) + +Hardware assistance for EPP and/or ECP transfers may or may not be +available, and if it is available it may or may not be used. If +hardware is not used, the transfer will be software-driven. In order +to cope with peripherals that only tenuously support IEEE 1284, a +low-level driver specific function is provided, for altering 'fudge +factors'. + +Global functions +================ + +parport_register_driver - register a device driver with parport +--------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_driver { + const char *name; + void (*attach) (struct parport *); + void (*detach) (struct parport *); + struct parport_driver *next; + }; + int parport_register_driver (struct parport_driver *driver); + +DESCRIPTION +^^^^^^^^^^^ + +In order to be notified about parallel ports when they are detected, +parport_register_driver should be called. Your driver will +immediately be notified of all ports that have already been detected, +and of each new port as low-level drivers are loaded. + +A ``struct parport_driver`` contains the textual name of your driver, +a pointer to a function to handle new ports, and a pointer to a +function to handle ports going away due to a low-level driver +unloading. Ports will only be detached if they are not being used +(i.e. there are no devices registered on them). + +The visible parts of the ``struct parport *`` argument given to +attach/detach are:: + + struct parport + { + struct parport *next; /* next parport in list */ + const char *name; /* port's name */ + unsigned int modes; /* bitfield of hardware modes */ + struct parport_device_info probe_info; + /* IEEE1284 info */ + int number; /* parport index */ + struct parport_operations *ops; + ... + }; + +There are other members of the structure, but they should not be +touched. + +The ``modes`` member summarises the capabilities of the underlying +hardware. It consists of flags which may be bitwise-ored together: + + ============================= =============================================== + PARPORT_MODE_PCSPP IBM PC registers are available, + i.e. functions that act on data, + control and status registers are + probably writing directly to the + hardware. + PARPORT_MODE_TRISTATE The data drivers may be turned off. + This allows the data lines to be used + for reverse (peripheral to host) + transfers. + PARPORT_MODE_COMPAT The hardware can assist with + compatibility-mode (printer) + transfers, i.e. compat_write_block. + PARPORT_MODE_EPP The hardware can assist with EPP + transfers. + PARPORT_MODE_ECP The hardware can assist with ECP + transfers. + PARPORT_MODE_DMA The hardware can use DMA, so you might + want to pass ISA DMA-able memory + (i.e. memory allocated using the + GFP_DMA flag with kmalloc) to the + low-level driver in order to take + advantage of it. + ============================= =============================================== + +There may be other flags in ``modes`` as well. + +The contents of ``modes`` is advisory only. For example, if the +hardware is capable of DMA, and PARPORT_MODE_DMA is in ``modes``, it +doesn't necessarily mean that DMA will always be used when possible. +Similarly, hardware that is capable of assisting ECP transfers won't +necessarily be used. + +RETURN VALUE +^^^^^^^^^^^^ + +Zero on success, otherwise an error code. + +ERRORS +^^^^^^ + +None. (Can it fail? Why return int?) + +EXAMPLE +^^^^^^^ + +:: + + static void lp_attach (struct parport *port) + { + ... + private = kmalloc (...); + dev[count++] = parport_register_device (...); + ... + } + + static void lp_detach (struct parport *port) + { + ... + } + + static struct parport_driver lp_driver = { + "lp", + lp_attach, + lp_detach, + NULL /* always put NULL here */ + }; + + int lp_init (void) + { + ... + if (parport_register_driver (&lp_driver)) { + /* Failed; nothing we can do. */ + return -EIO; + } + ... + } + + +SEE ALSO +^^^^^^^^ + +parport_unregister_driver, parport_register_device, parport_enumerate + + + +parport_unregister_driver - tell parport to forget about this driver +-------------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_driver { + const char *name; + void (*attach) (struct parport *); + void (*detach) (struct parport *); + struct parport_driver *next; + }; + void parport_unregister_driver (struct parport_driver *driver); + +DESCRIPTION +^^^^^^^^^^^ + +This tells parport not to notify the device driver of new ports or of +ports going away. Registered devices belonging to that driver are NOT +unregistered: parport_unregister_device must be used for each one. + +EXAMPLE +^^^^^^^ + +:: + + void cleanup_module (void) + { + ... + /* Stop notifications. */ + parport_unregister_driver (&lp_driver); + + /* Unregister devices. */ + for (i = 0; i < NUM_DEVS; i++) + parport_unregister_device (dev[i]); + ... + } + +SEE ALSO +^^^^^^^^ + +parport_register_driver, parport_enumerate + + + +parport_enumerate - retrieve a list of parallel ports (DEPRECATED) +------------------------------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport *parport_enumerate (void); + +DESCRIPTION +^^^^^^^^^^^ + +Retrieve the first of a list of valid parallel ports for this machine. +Successive parallel ports can be found using the ``struct parport +*next`` element of the ``struct parport *`` that is returned. If ``next`` +is NULL, there are no more parallel ports in the list. The number of +ports in the list will not exceed PARPORT_MAX. + +RETURN VALUE +^^^^^^^^^^^^ + +A ``struct parport *`` describing a valid parallel port for the machine, +or NULL if there are none. + +ERRORS +^^^^^^ + +This function can return NULL to indicate that there are no parallel +ports to use. + +EXAMPLE +^^^^^^^ + +:: + + int detect_device (void) + { + struct parport *port; + + for (port = parport_enumerate (); + port != NULL; + port = port->next) { + /* Try to detect a device on the port... */ + ... + } + } + + ... + } + +NOTES +^^^^^ + +parport_enumerate is deprecated; parport_register_driver should be +used instead. + +SEE ALSO +^^^^^^^^ + +parport_register_driver, parport_unregister_driver + + + +parport_register_device - register to use a port +------------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + typedef int (*preempt_func) (void *handle); + typedef void (*wakeup_func) (void *handle); + typedef int (*irq_func) (int irq, void *handle, struct pt_regs *); + + struct pardevice *parport_register_device(struct parport *port, + const char *name, + preempt_func preempt, + wakeup_func wakeup, + irq_func irq, + int flags, + void *handle); + +DESCRIPTION +^^^^^^^^^^^ + +Use this function to register your device driver on a parallel port +(``port``). Once you have done that, you will be able to use +parport_claim and parport_release in order to use the port. + +The (``name``) argument is the name of the device that appears in /proc +filesystem. The string must be valid for the whole lifetime of the +device (until parport_unregister_device is called). + +This function will register three callbacks into your driver: +``preempt``, ``wakeup`` and ``irq``. Each of these may be NULL in order to +indicate that you do not want a callback. + +When the ``preempt`` function is called, it is because another driver +wishes to use the parallel port. The ``preempt`` function should return +non-zero if the parallel port cannot be released yet -- if zero is +returned, the port is lost to another driver and the port must be +re-claimed before use. + +The ``wakeup`` function is called once another driver has released the +port and no other driver has yet claimed it. You can claim the +parallel port from within the ``wakeup`` function (in which case the +claim is guaranteed to succeed), or choose not to if you don't need it +now. + +If an interrupt occurs on the parallel port your driver has claimed, +the ``irq`` function will be called. (Write something about shared +interrupts here.) + +The ``handle`` is a pointer to driver-specific data, and is passed to +the callback functions. + +``flags`` may be a bitwise combination of the following flags: + + ===================== ================================================= + Flag Meaning + ===================== ================================================= + PARPORT_DEV_EXCL The device cannot share the parallel port at all. + Use this only when absolutely necessary. + ===================== ================================================= + +The typedefs are not actually defined -- they are only shown in order +to make the function prototype more readable. + +The visible parts of the returned ``struct pardevice`` are:: + + struct pardevice { + struct parport *port; /* Associated port */ + void *private; /* Device driver's 'handle' */ + ... + }; + +RETURN VALUE +^^^^^^^^^^^^ + +A ``struct pardevice *``: a handle to the registered parallel port +device that can be used for parport_claim, parport_release, etc. + +ERRORS +^^^^^^ + +A return value of NULL indicates that there was a problem registering +a device on that port. + +EXAMPLE +^^^^^^^ + +:: + + static int preempt (void *handle) + { + if (busy_right_now) + return 1; + + must_reclaim_port = 1; + return 0; + } + + static void wakeup (void *handle) + { + struct toaster *private = handle; + struct pardevice *dev = private->dev; + if (!dev) return; /* avoid races */ + + if (want_port) + parport_claim (dev); + } + + static int toaster_detect (struct toaster *private, struct parport *port) + { + private->dev = parport_register_device (port, "toaster", preempt, + wakeup, NULL, 0, + private); + if (!private->dev) + /* Couldn't register with parport. */ + return -EIO; + + must_reclaim_port = 0; + busy_right_now = 1; + parport_claim_or_block (private->dev); + ... + /* Don't need the port while the toaster warms up. */ + busy_right_now = 0; + ... + busy_right_now = 1; + if (must_reclaim_port) { + parport_claim_or_block (private->dev); + must_reclaim_port = 0; + } + ... + } + +SEE ALSO +^^^^^^^^ + +parport_unregister_device, parport_claim + + + +parport_unregister_device - finish using a port +----------------------------------------------- + +SYNPOPSIS + +:: + + #include + + void parport_unregister_device (struct pardevice *dev); + +DESCRIPTION +^^^^^^^^^^^ + +This function is the opposite of parport_register_device. After using +parport_unregister_device, ``dev`` is no longer a valid device handle. + +You should not unregister a device that is currently claimed, although +if you do it will be released automatically. + +EXAMPLE +^^^^^^^ + +:: + + ... + kfree (dev->private); /* before we lose the pointer */ + parport_unregister_device (dev); + ... + +SEE ALSO +^^^^^^^^ + + +parport_unregister_driver + +parport_claim, parport_claim_or_block - claim the parallel port for a device +---------------------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_claim (struct pardevice *dev); + int parport_claim_or_block (struct pardevice *dev); + +DESCRIPTION +^^^^^^^^^^^ + +These functions attempt to gain control of the parallel port on which +``dev`` is registered. ``parport_claim`` does not block, but +``parport_claim_or_block`` may do. (Put something here about blocking +interruptibly or non-interruptibly.) + +You should not try to claim a port that you have already claimed. + +RETURN VALUE +^^^^^^^^^^^^ + +A return value of zero indicates that the port was successfully +claimed, and the caller now has possession of the parallel port. + +If ``parport_claim_or_block`` blocks before returning successfully, the +return value is positive. + +ERRORS +^^^^^^ + +========== ========================================================== + -EAGAIN The port is unavailable at the moment, but another attempt + to claim it may succeed. +========== ========================================================== + +SEE ALSO +^^^^^^^^ + + +parport_release + +parport_release - release the parallel port +------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + void parport_release (struct pardevice *dev); + +DESCRIPTION +^^^^^^^^^^^ + +Once a parallel port device has been claimed, it can be released using +``parport_release``. It cannot fail, but you should not release a +device that you do not have possession of. + +EXAMPLE +^^^^^^^ + +:: + + static size_t write (struct pardevice *dev, const void *buf, + size_t len) + { + ... + written = dev->port->ops->write_ecp_data (dev->port, buf, + len); + parport_release (dev); + ... + } + + +SEE ALSO +^^^^^^^^ + +change_mode, parport_claim, parport_claim_or_block, parport_yield + + + +parport_yield, parport_yield_blocking - temporarily release a parallel port +--------------------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_yield (struct pardevice *dev) + int parport_yield_blocking (struct pardevice *dev); + +DESCRIPTION +^^^^^^^^^^^ + +When a driver has control of a parallel port, it may allow another +driver to temporarily ``borrow`` it. ``parport_yield`` does not block; +``parport_yield_blocking`` may do. + +RETURN VALUE +^^^^^^^^^^^^ + +A return value of zero indicates that the caller still owns the port +and the call did not block. + +A positive return value from ``parport_yield_blocking`` indicates that +the caller still owns the port and the call blocked. + +A return value of -EAGAIN indicates that the caller no longer owns the +port, and it must be re-claimed before use. + +ERRORS +^^^^^^ + +========= ========================================================== + -EAGAIN Ownership of the parallel port was given away. +========= ========================================================== + +SEE ALSO +^^^^^^^^ + +parport_release + + + +parport_wait_peripheral - wait for status lines, up to 35ms +----------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_wait_peripheral (struct parport *port, + unsigned char mask, + unsigned char val); + +DESCRIPTION +^^^^^^^^^^^ + +Wait for the status lines in mask to match the values in val. + +RETURN VALUE +^^^^^^^^^^^^ + +======== ========================================================== + -EINTR a signal is pending + 0 the status lines in mask have values in val + 1 timed out while waiting (35ms elapsed) +======== ========================================================== + +SEE ALSO +^^^^^^^^ + +parport_poll_peripheral + + + +parport_poll_peripheral - wait for status lines, in usec +-------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_poll_peripheral (struct parport *port, + unsigned char mask, + unsigned char val, + int usec); + +DESCRIPTION +^^^^^^^^^^^ + +Wait for the status lines in mask to match the values in val. + +RETURN VALUE +^^^^^^^^^^^^ + +======== ========================================================== + -EINTR a signal is pending + 0 the status lines in mask have values in val + 1 timed out while waiting (usec microseconds have elapsed) +======== ========================================================== + +SEE ALSO +^^^^^^^^ + +parport_wait_peripheral + + + +parport_wait_event - wait for an event on a port +------------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_wait_event (struct parport *port, signed long timeout) + +DESCRIPTION +^^^^^^^^^^^ + +Wait for an event (e.g. interrupt) on a port. The timeout is in +jiffies. + +RETURN VALUE +^^^^^^^^^^^^ + +======= ========================================================== + 0 success + <0 error (exit as soon as possible) + >0 timed out +======= ========================================================== + +parport_negotiate - perform IEEE 1284 negotiation +------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_negotiate (struct parport *, int mode); + +DESCRIPTION +^^^^^^^^^^^ + +Perform IEEE 1284 negotiation. + +RETURN VALUE +^^^^^^^^^^^^ + +======= ========================================================== + 0 handshake OK; IEEE 1284 peripheral and mode available + -1 handshake failed; peripheral not compliant (or none present) + 1 handshake OK; IEEE 1284 peripheral present but mode not + available +======= ========================================================== + +SEE ALSO +^^^^^^^^ + +parport_read, parport_write + + + +parport_read - read data from device +------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + ssize_t parport_read (struct parport *, void *buf, size_t len); + +DESCRIPTION +^^^^^^^^^^^ + +Read data from device in current IEEE 1284 transfer mode. This only +works for modes that support reverse data transfer. + +RETURN VALUE +^^^^^^^^^^^^ + +If negative, an error code; otherwise the number of bytes transferred. + +SEE ALSO +^^^^^^^^ + +parport_write, parport_negotiate + + + +parport_write - write data to device +------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + ssize_t parport_write (struct parport *, const void *buf, size_t len); + +DESCRIPTION +^^^^^^^^^^^ + +Write data to device in current IEEE 1284 transfer mode. This only +works for modes that support forward data transfer. + +RETURN VALUE +^^^^^^^^^^^^ + +If negative, an error code; otherwise the number of bytes transferred. + +SEE ALSO +^^^^^^^^ + +parport_read, parport_negotiate + + + +parport_open - register device for particular device number +----------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct pardevice *parport_open (int devnum, const char *name, + int (*pf) (void *), + void (*kf) (void *), + void (*irqf) (int, void *, + struct pt_regs *), + int flags, void *handle); + +DESCRIPTION +^^^^^^^^^^^ + +This is like parport_register_device but takes a device number instead +of a pointer to a struct parport. + +RETURN VALUE +^^^^^^^^^^^^ + +See parport_register_device. If no device is associated with devnum, +NULL is returned. + +SEE ALSO +^^^^^^^^ + +parport_register_device + + + +parport_close - unregister device for particular device number +-------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + void parport_close (struct pardevice *dev); + +DESCRIPTION +^^^^^^^^^^^ + +This is the equivalent of parport_unregister_device for parport_open. + +SEE ALSO +^^^^^^^^ + +parport_unregister_device, parport_open + + + +parport_device_id - obtain IEEE 1284 Device ID +---------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + ssize_t parport_device_id (int devnum, char *buffer, size_t len); + +DESCRIPTION +^^^^^^^^^^^ + +Obtains the IEEE 1284 Device ID associated with a given device. + +RETURN VALUE +^^^^^^^^^^^^ + +If negative, an error code; otherwise, the number of bytes of buffer +that contain the device ID. The format of the device ID is as +follows:: + + [length][ID] + +The first two bytes indicate the inclusive length of the entire Device +ID, and are in big-endian order. The ID is a sequence of pairs of the +form:: + + key:value; + +NOTES +^^^^^ + +Many devices have ill-formed IEEE 1284 Device IDs. + +SEE ALSO +^^^^^^^^ + +parport_find_class, parport_find_device + + + +parport_device_coords - convert device number to device coordinates +------------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_device_coords (int devnum, int *parport, int *mux, + int *daisy); + +DESCRIPTION +^^^^^^^^^^^ + +Convert between device number (zero-based) and device coordinates +(port, multiplexor, daisy chain address). + +RETURN VALUE +^^^^^^^^^^^^ + +Zero on success, in which case the coordinates are (``*parport``, ``*mux``, +``*daisy``). + +SEE ALSO +^^^^^^^^ + +parport_open, parport_device_id + + + +parport_find_class - find a device by its class +----------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + typedef enum { + PARPORT_CLASS_LEGACY = 0, /* Non-IEEE1284 device */ + PARPORT_CLASS_PRINTER, + PARPORT_CLASS_MODEM, + PARPORT_CLASS_NET, + PARPORT_CLASS_HDC, /* Hard disk controller */ + PARPORT_CLASS_PCMCIA, + PARPORT_CLASS_MEDIA, /* Multimedia device */ + PARPORT_CLASS_FDC, /* Floppy disk controller */ + PARPORT_CLASS_PORTS, + PARPORT_CLASS_SCANNER, + PARPORT_CLASS_DIGCAM, + PARPORT_CLASS_OTHER, /* Anything else */ + PARPORT_CLASS_UNSPEC, /* No CLS field in ID */ + PARPORT_CLASS_SCSIADAPTER + } parport_device_class; + + int parport_find_class (parport_device_class cls, int from); + +DESCRIPTION +^^^^^^^^^^^ + +Find a device by class. The search starts from device number from+1. + +RETURN VALUE +^^^^^^^^^^^^ + +The device number of the next device in that class, or -1 if no such +device exists. + +NOTES +^^^^^ + +Example usage:: + + int devnum = -1; + while ((devnum = parport_find_class (PARPORT_CLASS_DIGCAM, devnum)) != -1) { + struct pardevice *dev = parport_open (devnum, ...); + ... + } + +SEE ALSO +^^^^^^^^ + +parport_find_device, parport_open, parport_device_id + + + +parport_find_device - find a device by its class +------------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + int parport_find_device (const char *mfg, const char *mdl, int from); + +DESCRIPTION +^^^^^^^^^^^ + +Find a device by vendor and model. The search starts from device +number from+1. + +RETURN VALUE +^^^^^^^^^^^^ + +The device number of the next device matching the specifications, or +-1 if no such device exists. + +NOTES +^^^^^ + +Example usage:: + + int devnum = -1; + while ((devnum = parport_find_device ("IOMEGA", "ZIP+", devnum)) != -1) { + struct pardevice *dev = parport_open (devnum, ...); + ... + } + +SEE ALSO +^^^^^^^^ + +parport_find_class, parport_open, parport_device_id + + + +parport_set_timeout - set the inactivity timeout +------------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + long parport_set_timeout (struct pardevice *dev, long inactivity); + +DESCRIPTION +^^^^^^^^^^^ + +Set the inactivity timeout, in jiffies, for a registered device. The +previous timeout is returned. + +RETURN VALUE +^^^^^^^^^^^^ + +The previous timeout, in jiffies. + +NOTES +^^^^^ + +Some of the port->ops functions for a parport may take time, owing to +delays at the peripheral. After the peripheral has not responded for +``inactivity`` jiffies, a timeout will occur and the blocking function +will return. + +A timeout of 0 jiffies is a special case: the function must do as much +as it can without blocking or leaving the hardware in an unknown +state. If port operations are performed from within an interrupt +handler, for instance, a timeout of 0 jiffies should be used. + +Once set for a registered device, the timeout will remain at the set +value until set again. + +SEE ALSO +^^^^^^^^ + +port->ops->xxx_read/write_yyy + + + + +PORT FUNCTIONS +============== + +The functions in the port->ops structure (struct parport_operations) +are provided by the low-level driver responsible for that port. + +port->ops->read_data - read the data register +--------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + unsigned char (*read_data) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +If port->modes contains the PARPORT_MODE_TRISTATE flag and the +PARPORT_CONTROL_DIRECTION bit in the control register is set, this +returns the value on the data pins. If port->modes contains the +PARPORT_MODE_TRISTATE flag and the PARPORT_CONTROL_DIRECTION bit is +not set, the return value _may_ be the last value written to the data +register. Otherwise the return value is undefined. + +SEE ALSO +^^^^^^^^ + +write_data, read_status, write_control + + + +port->ops->write_data - write the data register +----------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*write_data) (struct parport *port, unsigned char d); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes to the data register. May have side-effects (a STROBE pulse, +for instance). + +SEE ALSO +^^^^^^^^ + +read_data, read_status, write_control + + + +port->ops->read_status - read the status register +------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + unsigned char (*read_status) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads from the status register. This is a bitmask: + +- PARPORT_STATUS_ERROR (printer fault, "nFault") +- PARPORT_STATUS_SELECT (on-line, "Select") +- PARPORT_STATUS_PAPEROUT (no paper, "PError") +- PARPORT_STATUS_ACK (handshake, "nAck") +- PARPORT_STATUS_BUSY (busy, "Busy") + +There may be other bits set. + +SEE ALSO +^^^^^^^^ + +read_data, write_data, write_control + + + +port->ops->read_control - read the control register +--------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + unsigned char (*read_control) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Returns the last value written to the control register (either from +write_control or frob_control). No port access is performed. + +SEE ALSO +^^^^^^^^ + +read_data, write_data, read_status, write_control + + + +port->ops->write_control - write the control register +----------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*write_control) (struct parport *port, unsigned char s); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes to the control register. This is a bitmask:: + + _______ + - PARPORT_CONTROL_STROBE (nStrobe) + _______ + - PARPORT_CONTROL_AUTOFD (nAutoFd) + _____ + - PARPORT_CONTROL_INIT (nInit) + _________ + - PARPORT_CONTROL_SELECT (nSelectIn) + +SEE ALSO +^^^^^^^^ + +read_data, write_data, read_status, frob_control + + + +port->ops->frob_control - write control register bits +----------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + unsigned char (*frob_control) (struct parport *port, + unsigned char mask, + unsigned char val); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +This is equivalent to reading from the control register, masking out +the bits in mask, exclusive-or'ing with the bits in val, and writing +the result to the control register. + +As some ports don't allow reads from the control port, a software copy +of its contents is maintained, so frob_control is in fact only one +port access. + +SEE ALSO +^^^^^^^^ + +read_data, write_data, read_status, write_control + + + +port->ops->enable_irq - enable interrupt generation +--------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*enable_irq) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +The parallel port hardware is instructed to generate interrupts at +appropriate moments, although those moments are +architecture-specific. For the PC architecture, interrupts are +commonly generated on the rising edge of nAck. + +SEE ALSO +^^^^^^^^ + +disable_irq + + + +port->ops->disable_irq - disable interrupt generation +----------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*disable_irq) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +The parallel port hardware is instructed not to generate interrupts. +The interrupt itself is not masked. + +SEE ALSO +^^^^^^^^ + +enable_irq + + + +port->ops->data_forward - enable data drivers +--------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*data_forward) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Enables the data line drivers, for 8-bit host-to-peripheral +communications. + +SEE ALSO +^^^^^^^^ + +data_reverse + + + +port->ops->data_reverse - tristate the buffer +--------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + void (*data_reverse) (struct parport *port); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Places the data bus in a high impedance state, if port->modes has the +PARPORT_MODE_TRISTATE bit set. + +SEE ALSO +^^^^^^^^ + +data_forward + + + +port->ops->epp_write_data - write EPP data +------------------------------------------ + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*epp_write_data) (struct parport *port, const void *buf, + size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes data in EPP mode, and returns the number of bytes written. + +The ``flags`` parameter may be one or more of the following, +bitwise-or'ed together: + +======================= ================================================= +PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and + 32-bit registers. However, if a transfer + times out, the return value may be unreliable. +======================= ================================================= + +SEE ALSO +^^^^^^^^ + +epp_read_data, epp_write_addr, epp_read_addr + + + +port->ops->epp_read_data - read EPP data +---------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*epp_read_data) (struct parport *port, void *buf, + size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads data in EPP mode, and returns the number of bytes read. + +The ``flags`` parameter may be one or more of the following, +bitwise-or'ed together: + +======================= ================================================= +PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and + 32-bit registers. However, if a transfer + times out, the return value may be unreliable. +======================= ================================================= + +SEE ALSO +^^^^^^^^ + +epp_write_data, epp_write_addr, epp_read_addr + + + +port->ops->epp_write_addr - write EPP address +--------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*epp_write_addr) (struct parport *port, + const void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes EPP addresses (8 bits each), and returns the number written. + +The ``flags`` parameter may be one or more of the following, +bitwise-or'ed together: + +======================= ================================================= +PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and + 32-bit registers. However, if a transfer + times out, the return value may be unreliable. +======================= ================================================= + +(Does PARPORT_EPP_FAST make sense for this function?) + +SEE ALSO +^^^^^^^^ + +epp_write_data, epp_read_data, epp_read_addr + + + +port->ops->epp_read_addr - read EPP address +------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*epp_read_addr) (struct parport *port, void *buf, + size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads EPP addresses (8 bits each), and returns the number read. + +The ``flags`` parameter may be one or more of the following, +bitwise-or'ed together: + +======================= ================================================= +PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and + 32-bit registers. However, if a transfer + times out, the return value may be unreliable. +======================= ================================================= + +(Does PARPORT_EPP_FAST make sense for this function?) + +SEE ALSO +^^^^^^^^ + +epp_write_data, epp_read_data, epp_write_addr + + + +port->ops->ecp_write_data - write a block of ECP data +----------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*ecp_write_data) (struct parport *port, + const void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes a block of ECP data. The ``flags`` parameter is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of bytes written. + +SEE ALSO +^^^^^^^^ + +ecp_read_data, ecp_write_addr + + + +port->ops->ecp_read_data - read a block of ECP data +--------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*ecp_read_data) (struct parport *port, + void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads a block of ECP data. The ``flags`` parameter is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of bytes read. NB. There may be more unread data in a +FIFO. Is there a way of stunning the FIFO to prevent this? + +SEE ALSO +^^^^^^^^ + +ecp_write_block, ecp_write_addr + + + +port->ops->ecp_write_addr - write a block of ECP addresses +---------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*ecp_write_addr) (struct parport *port, + const void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes a block of ECP addresses. The ``flags`` parameter is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of bytes written. + +NOTES +^^^^^ + +This may use a FIFO, and if so shall not return until the FIFO is empty. + +SEE ALSO +^^^^^^^^ + +ecp_read_data, ecp_write_data + + + +port->ops->nibble_read_data - read a block of data in nibble mode +----------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*nibble_read_data) (struct parport *port, + void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads a block of data in nibble mode. The ``flags`` parameter is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of whole bytes read. + +SEE ALSO +^^^^^^^^ + +byte_read_data, compat_write_data + + + +port->ops->byte_read_data - read a block of data in byte mode +------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*byte_read_data) (struct parport *port, + void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Reads a block of data in byte mode. The ``flags`` parameter is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of bytes read. + +SEE ALSO +^^^^^^^^ + +nibble_read_data, compat_write_data + + + +port->ops->compat_write_data - write a block of data in compatibility mode +-------------------------------------------------------------------------- + +SYNOPSIS +^^^^^^^^ + +:: + + #include + + struct parport_operations { + ... + size_t (*compat_write_data) (struct parport *port, + const void *buf, size_t len, int flags); + ... + }; + +DESCRIPTION +^^^^^^^^^^^ + +Writes a block of data in compatibility mode. The ``flags`` parameter +is ignored. + +RETURN VALUE +^^^^^^^^^^^^ + +The number of bytes written. + +SEE ALSO +^^^^^^^^ + +nibble_read_data, byte_read_data diff --git a/Documentation/driver-api/pti_intel_mid.rst b/Documentation/driver-api/pti_intel_mid.rst new file mode 100644 index 000000000000..20f1cff42d5f --- /dev/null +++ b/Documentation/driver-api/pti_intel_mid.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +Intel MID PTI +============= + +The Intel MID PTI project is HW implemented in Intel Atom +system-on-a-chip designs based on the Parallel Trace +Interface for MIPI P1149.7 cJTAG standard. The kernel solution +for this platform involves the following files:: + + ./include/linux/pti.h + ./drivers/.../n_tracesink.h + ./drivers/.../n_tracerouter.c + ./drivers/.../n_tracesink.c + ./drivers/.../pti.c + +pti.c is the driver that enables various debugging features +popular on platforms from certain mobile manufacturers. +n_tracerouter.c and n_tracesink.c allow extra system information to +be collected and routed to the pti driver, such as trace +debugging data from a modem. Although n_tracerouter +and n_tracesink are a part of the complete PTI solution, +these two line disciplines can work separately from +pti.c and route any data stream from one /dev/tty node +to another /dev/tty node via kernel-space. This provides +a stable, reliable connection that will not break unless +the user-space application shuts down (plus avoids +kernel->user->kernel context switch overheads of routing +data). + +An example debugging usage for this driver system: + + * Hook /dev/ttyPTI0 to syslogd. Opening this port will also start + a console device to further capture debugging messages to PTI. + * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW. + This is where n_tracerouter and n_tracesink are used. + * Hook /dev/pti to a user-level debugging application for writing + to PTI HW. + * `Use mipi_` Kernel Driver API in other device drivers for + debugging to PTI by first requesting a PTI write address via + mipi_request_masterchannel(1). + +Below is example pseudo-code on how a 'privileged' application +can hook up n_tracerouter and n_tracesink to any tty on +a system. 'Privileged' means the application has enough +privileges to successfully manipulate the ldisc drivers +but is not just blindly executing as 'root'. Keep in mind +the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter +and n_tracesink line discpline drivers but is a generic +operation for a program to use a line discpline driver +on a tty port other than the default n_tty:: + + /////////// To hook up n_tracerouter and n_tracesink ///////// + + // Note that n_tracerouter depends on n_tracesink. + #include + #define ONE_TTY "/dev/ttyOne" + #define TWO_TTY "/dev/ttyTwo" + + // needed global to hand onto ldisc connection + static int g_fd_source = -1; + static int g_fd_sink = -1; + + // these two vars used to grab LDISC values from loaded ldisc drivers + // in OS. Look at /proc/tty/ldiscs to get the right numbers from + // the ldiscs loaded in the system. + int source_ldisc_num, sink_ldisc_num = -1; + int retval; + + g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W + g_fd_sink = open(TWO_TTY, O_RDWR); // must be R/W + + if (g_fd_source <= 0) || (g_fd_sink <= 0) { + // doubt you'll want to use these exact error lines of code + printf("Error on open(). errno: %d\n",errno); + return errno; + } + + retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num); + if (retval < 0) { + printf("Error on ioctl(). errno: %d\n", errno); + return errno; + } + + retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num); + if (retval < 0) { + printf("Error on ioctl(). errno: %d\n", errno); + return errno; + } + + /////////// To disconnect n_tracerouter and n_tracesink //////// + + // First make sure data through the ldiscs has stopped. + + // Second, disconnect ldiscs. This provides a + // little cleaner shutdown on tty stack. + sink_ldisc_num = 0; + source_ldisc_num = 0; + ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num); + ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num); + + // Three, program closes connection, and cleanup: + close(g_fd_uart); + close(g_fd_gadget); + g_fd_uart = g_fd_gadget = NULL; diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst new file mode 100644 index 000000000000..ab62f1bb0366 --- /dev/null +++ b/Documentation/driver-api/pwm.rst @@ -0,0 +1,165 @@ +====================================== +Pulse Width Modulation (PWM) interface +====================================== + +This provides an overview about the Linux PWM interface + +PWMs are commonly used for controlling LEDs, fans or vibrators in +cell phones. PWMs with a fixed purpose have no need implementing +the Linux PWM API (although they could). However, PWMs are often +found as discrete devices on SoCs which have no fixed purpose. It's +up to the board designer to connect them to LEDs or fans. To provide +this kind of flexibility the generic PWM API exists. + +Identifying PWMs +---------------- + +Users of the legacy PWM API use unique IDs to refer to PWM devices. + +Instead of referring to a PWM device via its unique ID, board setup code +should instead register a static mapping that can be used to match PWM +consumers to providers, as given in the following example:: + + static struct pwm_lookup board_pwm_lookup[] = { + PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL, + 50000, PWM_POLARITY_NORMAL), + }; + + static void __init board_init(void) + { + ... + pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup)); + ... + } + +Using PWMs +---------- + +Legacy users can request a PWM device using pwm_request() and free it +after usage with pwm_free(). + +New users should use the pwm_get() function and pass to it the consumer +device or a consumer name. pwm_put() is used to free the PWM device. Managed +variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist. + +After being requested, a PWM has to be configured using:: + + int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state); + +This API controls both the PWM period/duty_cycle config and the +enable/disable state. + +The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers +around pwm_apply_state() and should not be used if the user wants to change +several parameter at once. For example, if you see pwm_config() and +pwm_{enable,disable}() calls in the same function, this probably means you +should switch to pwm_apply_state(). + +The PWM user API also allows one to query the PWM state with pwm_get_state(). + +In addition to the PWM state, the PWM API also exposes PWM arguments, which +are the reference PWM config one should use on this PWM. +PWM arguments are usually platform-specific and allows the PWM user to only +care about dutycycle relatively to the full period (like, duty = 50% of the +period). struct pwm_args contains 2 fields (period and polarity) and should +be used to set the initial PWM config (usually done in the probe function +of the PWM user). PWM arguments are retrieved with pwm_get_args(). + +All consumers should really be reconfiguring the PWM upon resume as +appropriate. This is the only way to ensure that everything is resumed in +the proper order. + +Using PWMs with the sysfs interface +----------------------------------- + +If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs +interface is provided to use the PWMs from userspace. It is exposed at +/sys/class/pwm/. Each probed PWM controller/chip will be exported as +pwmchipN, where N is the base of the PWM chip. Inside the directory you +will find: + + npwm + The number of PWM channels this chip supports (read-only). + + export + Exports a PWM channel for use with sysfs (write-only). + + unexport + Unexports a PWM channel from sysfs (write-only). + +The PWM channels are numbered using a per-chip index from 0 to npwm-1. + +When a PWM channel is exported a pwmX directory will be created in the +pwmchipN directory it is associated with, where X is the number of the +channel that was exported. The following properties will then be available: + + period + The total period of the PWM signal (read/write). + Value is in nanoseconds and is the sum of the active and inactive + time of the PWM. + + duty_cycle + The active time of the PWM signal (read/write). + Value is in nanoseconds and must be less than the period. + + polarity + Changes the polarity of the PWM signal (read/write). + Writes to this property only work if the PWM chip supports changing + the polarity. The polarity can only be changed if the PWM is not + enabled. Value is the string "normal" or "inversed". + + enable + Enable/disable the PWM signal (read/write). + + - 0 - disabled + - 1 - enabled + +Implementing a PWM driver +------------------------- + +Currently there are two ways to implement pwm drivers. Traditionally +there only has been the barebone API meaning that each driver has +to implement the pwm_*() functions itself. This means that it's impossible +to have multiple PWM drivers in the system. For this reason it's mandatory +for new drivers to use the generic PWM framework. + +A new PWM controller/chip can be added using pwmchip_add() and removed +again with pwmchip_remove(). pwmchip_add() takes a filled in struct +pwm_chip as argument which provides a description of the PWM chip, the +number of PWM devices provided by the chip and the chip-specific +implementation of the supported PWM operations to the framework. + +When implementing polarity support in a PWM driver, make sure to respect the +signal conventions in the PWM framework. By definition, normal polarity +characterizes a signal starts high for the duration of the duty cycle and +goes low for the remainder of the period. Conversely, a signal with inversed +polarity starts low for the duration of the duty cycle and goes high for the +remainder of the period. + +Drivers are encouraged to implement ->apply() instead of the legacy +->enable(), ->disable() and ->config() methods. Doing that should provide +atomicity in the PWM config workflow, which is required when the PWM controls +a critical device (like a regulator). + +The implementation of ->get_state() (a method used to retrieve initial PWM +state) is also encouraged for the same reason: letting the PWM user know +about the current PWM state would allow him to avoid glitches. + +Drivers should not implement any power management. In other words, +consumers should implement it as described in the "Using PWMs" section. + +Locking +------- + +The PWM core list manipulations are protected by a mutex, so pwm_request() +and pwm_free() may not be called from an atomic context. Currently the +PWM core does not enforce any locking to pwm_enable(), pwm_disable() and +pwm_config(), so the calling context is currently driver specific. This +is an issue derived from the former barebone API and should be fixed soon. + +Helpers +------- + +Currently a PWM can only be configured with period_ns and duty_ns. For several +use cases freq_hz and duty_percent might be better. Instead of calculating +this in your driver please consider adding appropriate helpers to the framework. diff --git a/Documentation/driver-api/rfkill.rst b/Documentation/driver-api/rfkill.rst new file mode 100644 index 000000000000..7d3684e81df6 --- /dev/null +++ b/Documentation/driver-api/rfkill.rst @@ -0,0 +1,132 @@ +=============================== +rfkill - RF kill switch support +=============================== + + +.. contents:: + :depth: 2 + +Introduction +============ + +The rfkill subsystem provides a generic interface for disabling any radio +transmitter in the system. When a transmitter is blocked, it shall not +radiate any power. + +The subsystem also provides the ability to react on button presses and +disable all transmitters of a certain type (or all). This is intended for +situations where transmitters need to be turned off, for example on +aircraft. + +The rfkill subsystem has a concept of "hard" and "soft" block, which +differ little in their meaning (block == transmitters off) but rather in +whether they can be changed or not: + + - hard block + read-only radio block that cannot be overridden by software + + - soft block + writable radio block (need not be readable) that is set by + the system software. + +The rfkill subsystem has two parameters, rfkill.default_state and +rfkill.master_switch_mode, which are documented in +admin-guide/kernel-parameters.rst. + + +Implementation details +====================== + +The rfkill subsystem is composed of three main components: + + * the rfkill core, + * the deprecated rfkill-input module (an input layer handler, being + replaced by userspace policy code) and + * the rfkill drivers. + +The rfkill core provides API for kernel drivers to register their radio +transmitter with the kernel, methods for turning it on and off, and letting +the system know about hardware-disabled states that may be implemented on +the device. + +The rfkill core code also notifies userspace of state changes, and provides +ways for userspace to query the current states. See the "Userspace support" +section below. + +When the device is hard-blocked (either by a call to rfkill_set_hw_state() +or from query_hw_block), set_block() will be invoked for additional software +block, but drivers can ignore the method call since they can use the return +value of the function rfkill_set_hw_state() to sync the software state +instead of keeping track of calls to set_block(). In fact, drivers should +use the return value of rfkill_set_hw_state() unless the hardware actually +keeps track of soft and hard block separately. + + +Kernel API +========== + +Drivers for radio transmitters normally implement an rfkill driver. + +Platform drivers might implement input devices if the rfkill button is just +that, a button. If that button influences the hardware then you need to +implement an rfkill driver instead. This also applies if the platform provides +a way to turn on/off the transmitter(s). + +For some platforms, it is possible that the hardware state changes during +suspend/hibernation, in which case it will be necessary to update the rfkill +core with the current state at resume time. + +To create an rfkill driver, driver's Kconfig needs to have:: + + depends on RFKILL || !RFKILL + +to ensure the driver cannot be built-in when rfkill is modular. The !RFKILL +case allows the driver to be built when rfkill is not configured, in which +case all rfkill API can still be used but will be provided by static inlines +which compile to almost nothing. + +Calling rfkill_set_hw_state() when a state change happens is required from +rfkill drivers that control devices that can be hard-blocked unless they also +assign the poll_hw_block() callback (then the rfkill core will poll the +device). Don't do this unless you cannot get the event in any other way. + +rfkill provides per-switch LED triggers, which can be used to drive LEDs +according to the switch state (LED_FULL when blocked, LED_OFF otherwise). + + +Userspace support +================= + +The recommended userspace interface to use is /dev/rfkill, which is a misc +character device that allows userspace to obtain and set the state of rfkill +devices and sets of devices. It also notifies userspace about device addition +and removal. The API is a simple read/write API that is defined in +linux/rfkill.h, with one ioctl that allows turning off the deprecated input +handler in the kernel for the transition period. + +Except for the one ioctl, communication with the kernel is done via read() +and write() of instances of 'struct rfkill_event'. In this structure, the +soft and hard block are properly separated (unlike sysfs, see below) and +userspace is able to get a consistent snapshot of all rfkill devices in the +system. Also, it is possible to switch all rfkill drivers (or all drivers of +a specified type) into a state which also updates the default state for +hotplugged devices. + +After an application opens /dev/rfkill, it can read the current state of all +devices. Changes can be obtained by either polling the descriptor for +hotplug or state change events or by listening for uevents emitted by the +rfkill core framework. + +Additionally, each rfkill device is registered in sysfs and emits uevents. + +rfkill devices issue uevents (with an action of "change"), with the following +environment variables set:: + + RFKILL_NAME + RFKILL_STATE + RFKILL_TYPE + +The content of these variables corresponds to the "name", "state" and +"type" sysfs files explained above. + +For further details consult Documentation/ABI/stable/sysfs-class-rfkill. diff --git a/Documentation/driver-api/sgi-ioc4.rst b/Documentation/driver-api/sgi-ioc4.rst new file mode 100644 index 000000000000..72709222d3c0 --- /dev/null +++ b/Documentation/driver-api/sgi-ioc4.rst @@ -0,0 +1,49 @@ +==================================== +SGI IOC4 PCI (multi function) device +==================================== + +The SGI IOC4 PCI device is a bit of a strange beast, so some notes on +it are in order. + +First, even though the IOC4 performs multiple functions, such as an +IDE controller, a serial controller, a PS/2 keyboard/mouse controller, +and an external interrupt mechanism, it's not implemented as a +multifunction device. The consequence of this from a software +standpoint is that all these functions share a single IRQ, and +they can't all register to own the same PCI device ID. To make +matters a bit worse, some of the register blocks (and even registers +themselves) present in IOC4 are mixed-purpose between these several +functions, meaning that there's no clear "owning" device driver. + +The solution is to organize the IOC4 driver into several independent +drivers, "ioc4", "sgiioc4", and "ioc4_serial". Note that there is no +PS/2 controller driver as this functionality has never been wired up +on a shipping IO card. + +ioc4 +==== +This is the core (or shim) driver for IOC4. It is responsible for +initializing the basic functionality of the chip, and allocating +the PCI resources that are shared between the IOC4 functions. + +This driver also provides registration functions that the other +IOC4 drivers can call to make their presence known. Each driver +needs to provide a probe and remove function, which are invoked +by the core driver at appropriate times. The interface of these +IOC4 function probe and remove operations isn't precisely the same +as PCI device probe and remove operations, but is logically the +same operation. + +sgiioc4 +======= +This is the IDE driver for IOC4. Its name isn't very descriptive +simply for historical reasons (it used to be the only IOC4 driver +component). There's not much to say about it other than it hooks +up to the ioc4 driver via the appropriate registration, probe, and +remove functions. + +ioc4_serial +=========== +This is the serial driver for IOC4. There's not much to say about it +other than it hooks up to the ioc4 driver via the appropriate registration, +probe, and remove functions. diff --git a/Documentation/driver-api/sm501.rst b/Documentation/driver-api/sm501.rst new file mode 100644 index 000000000000..882507453ba4 --- /dev/null +++ b/Documentation/driver-api/sm501.rst @@ -0,0 +1,74 @@ +.. include:: + +============ +SM501 Driver +============ + +:Copyright: |copy| 2006, 2007 Simtec Electronics + +The Silicon Motion SM501 multimedia companion chip is a multifunction device +which may provide numerous interfaces including USB host controller USB gadget, +asynchronous serial ports, audio functions, and a dual display video interface. +The device may be connected by PCI or local bus with varying functions enabled. + +Core +---- + +The core driver in drivers/mfd provides common services for the +drivers which manage the specific hardware blocks. These services +include locking for common registers, clock control and resource +management. + +The core registers drivers for both PCI and generic bus based +chips via the platform device and driver system. + +On detection of a device, the core initialises the chip (which may +be specified by the platform data) and then exports the selected +peripheral set as platform devices for the specific drivers. + +The core re-uses the platform device system as the platform device +system provides enough features to support the drivers without the +need to create a new bus-type and the associated code to go with it. + + +Resources +--------- + +Each peripheral has a view of the device which is implicitly narrowed to +the specific set of resources that peripheral requires in order to +function correctly. + +The centralised memory allocation allows the driver to ensure that the +maximum possible resource allocation can be made to the video subsystem +as this is by-far the most resource-sensitive of the on-chip functions. + +The primary issue with memory allocation is that of moving the video +buffers once a display mode is chosen. Indeed when a video mode change +occurs the memory footprint of the video subsystem changes. + +Since video memory is difficult to move without changing the display +(unless sufficient contiguous memory can be provided for the old and new +modes simultaneously) the video driver fully utilises the memory area +given to it by aligning fb0 to the start of the area and fb1 to the end +of it. Any memory left over in the middle is used for the acceleration +functions, which are transient and thus their location is less critical +as it can be moved. + + +Configuration +------------- + +The platform device driver uses a set of platform data to pass +configurations through to the core and the subsidiary drivers +so that there can be support for more than one system carrying +an SM501 built into a single kernel image. + +The PCI driver assumes that the PCI card behaves as per the Silicon +Motion reference design. + +There is an errata (AB-5) affecting the selection of the +of the M1XCLK and M1CLK frequencies. These two clocks +must be sourced from the same PLL, although they can then +be divided down individually. If this is not set, then SM501 may +lock and hang the whole system. The driver will refuse to +attach if the PLL selection is different. diff --git a/Documentation/driver-api/smsc_ece1099.rst b/Documentation/driver-api/smsc_ece1099.rst new file mode 100644 index 000000000000..079277421eaf --- /dev/null +++ b/Documentation/driver-api/smsc_ece1099.rst @@ -0,0 +1,60 @@ +================================================= +Msc Keyboard Scan Expansion/GPIO Expansion device +================================================= + +What is smsc-ece1099? +---------------------- + +The ECE1099 is a 40-Pin 3.3V Keyboard Scan Expansion +or GPIO Expansion device. The device supports a keyboard +scan matrix of 23x8. The device is connected to a Master +via the SMSC BC-Link interface or via the SMBus. +Keypad scan Input(KSI) and Keypad Scan Output(KSO) signals +are multiplexed with GPIOs. + +Interrupt generation +-------------------- + +Interrupts can be generated by an edge detection on a GPIO +pin or an edge detection on one of the bus interface pins. +Interrupts can also be detected on the keyboard scan interface. +The bus interrupt pin (BC_INT# or SMBUS_INT#) is asserted if +any bit in one of the Interrupt Status registers is 1 and +the corresponding Interrupt Mask bit is also 1. + +In order for software to determine which device is the source +of an interrupt, it should first read the Group Interrupt Status Register +to determine which Status register group is a source for the interrupt. +Software should read both the Status register and the associated Mask register, +then AND the two values together. Bits that are 1 in the result of the AND +are active interrupts. Software clears an interrupt by writing a 1 to the +corresponding bit in the Status register. + +Communication Protocol +---------------------- + +- SMbus slave Interface + The host processor communicates with the ECE1099 device + through a series of read/write registers via the SMBus + interface. SMBus is a serial communication protocol between + a computer host and its peripheral devices. The SMBus data + rate is 10KHz minimum to 400 KHz maximum + +- Slave Bus Interface + The ECE1099 device SMBus implementation is a subset of the + SMBus interface to the host. The device is a slave-only SMBus device. + The implementation in the device is a subset of SMBus since it + only supports four protocols. + + The Write Byte, Read Byte, Send Byte, and Receive Byte protocols are the + only valid SMBus protocols for the device. + +- BC-LinkTM Interface + The BC-Link is a proprietary bus that allows communication + between a Master device and a Companion device. The Master + device uses this serial bus to read and write registers + located on the Companion device. The bus comprises three signals, + BC_CLK, BC_DAT and BC_INT#. The Master device always provides the + clock, BC_CLK, and the Companion device is the source for an + independent asynchronous interrupt signal, BC_INT#. The ECE1099 + supports BC-Link speeds up to 24MHz. diff --git a/Documentation/driver-api/switchtec.rst b/Documentation/driver-api/switchtec.rst new file mode 100644 index 000000000000..7611fdc53e19 --- /dev/null +++ b/Documentation/driver-api/switchtec.rst @@ -0,0 +1,102 @@ +======================== +Linux Switchtec Support +======================== + +Microsemi's "Switchtec" line of PCI switch devices is already +supported by the kernel with standard PCI switch drivers. However, the +Switchtec device advertises a special management endpoint which +enables some additional functionality. This includes: + +* Packet and Byte Counters +* Firmware Upgrades +* Event and Error logs +* Querying port link status +* Custom user firmware commands + +The switchtec kernel module implements this functionality. + + +Interface +========= + +The primary means of communicating with the Switchtec management firmware is +through the Memory-mapped Remote Procedure Call (MRPC) interface. +Commands are submitted to the interface with a 4-byte command +identifier and up to 1KB of command specific data. The firmware will +respond with a 4-byte return code and up to 1KB of command-specific +data. The interface only processes a single command at a time. + + +Userspace Interface +=================== + +The MRPC interface will be exposed to userspace through a simple char +device: /dev/switchtec#, one for each management endpoint in the system. + +The char device has the following semantics: + +* A write must consist of at least 4 bytes and no more than 1028 bytes. + The first 4 bytes will be interpreted as the Command ID and the + remainder will be used as the input data. A write will send the + command to the firmware to begin processing. + +* Each write must be followed by exactly one read. Any double write will + produce an error and any read that doesn't follow a write will + produce an error. + +* A read will block until the firmware completes the command and return + the 4-byte Command Return Value plus up to 1024 bytes of output + data. (The length will be specified by the size parameter of the read + call -- reading less than 4 bytes will produce an error.) + +* The poll call will also be supported for userspace applications that + need to do other things while waiting for the command to complete. + +The following IOCTLs are also supported by the device: + +* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number + of partitions in the device. + +* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for + any specified partition in flash. + +* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps + indicating all uncleared events. + +* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags + for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct + with the event_id, index and flags set (index being the partition or PFF + number for non-global events). It returns whether the event has + occurred, the number of times and any event specific data. The flags + can be used to clear the count or enable and disable actions to + happen when the event occurs. + By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag, + you can set an event to trigger a poll command to return with + POLLPRI. In this way, userspace can wait for events to occur. + +* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert + between PCI Function Framework number (used by the event system) + and Switchtec Logic Port ID and Partition number (which is more + user friendly). + + +Non-Transparent Bridge (NTB) Driver +=================================== + +An NTB hardware driver is provided for the Switchtec hardware in +ntb_hw_switchtec. Currently, it only supports switches configured with +exactly 2 NT partitions and zero or more non-NT partitions. It also requires +the following configuration settings: + +* Both NT partitions must be able to access each other's GAS spaces. + Thus, the bits in the GAS Access Vector under Management Settings + must be set to support this. +* Kernel configuration MUST include support for NTB (CONFIG_NTB needs + to be set) + +NT EP BAR 2 will be dynamically configured as a Direct Window, and +the configuration file does not need to configure it explicitly. + +Please refer to Documentation/driver-api/ntb.rst in Linux source tree for an overall +understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB +Hardware Driver in this stack. diff --git a/Documentation/driver-api/sync_file.rst b/Documentation/driver-api/sync_file.rst new file mode 100644 index 000000000000..496fb2c3b3e6 --- /dev/null +++ b/Documentation/driver-api/sync_file.rst @@ -0,0 +1,86 @@ +=================== +Sync File API Guide +=================== + +:Author: Gustavo Padovan + +This document serves as a guide for device drivers writers on what the +sync_file API is, and how drivers can support it. Sync file is the carrier of +the fences(struct dma_fence) that are needed to synchronize between drivers or +across process boundaries. + +The sync_file API is meant to be used to send and receive fence information +to/from userspace. It enables userspace to do explicit fencing, where instead +of attaching a fence to the buffer a producer driver (such as a GPU or V4L +driver) sends the fence related to the buffer to userspace via a sync_file. + +The sync_file then can be sent to the consumer (DRM driver for example), that +will not use the buffer for anything before the fence(s) signals, i.e., the +driver that issued the fence is not using/processing the buffer anymore, so it +signals that the buffer is ready to use. And vice-versa for the consumer -> +producer part of the cycle. + +Sync files allows userspace awareness on buffer sharing synchronization between +drivers. + +Sync file was originally added in the Android kernel but current Linux Desktop +can benefit a lot from it. + +in-fences and out-fences +------------------------ + +Sync files can go either to or from userspace. When a sync_file is sent from +the driver to userspace we call the fences it contains 'out-fences'. They are +related to a buffer that the driver is processing or is going to process, so +the driver creates an out-fence to be able to notify, through +dma_fence_signal(), when it has finished using (or processing) that buffer. +Out-fences are fences that the driver creates. + +On the other hand if the driver receives fence(s) through a sync_file from +userspace we call these fence(s) 'in-fences'. Receiving in-fences means that +we need to wait for the fence(s) to signal before using any buffer related to +the in-fences. + +Creating Sync Files +------------------- + +When a driver needs to send an out-fence userspace it creates a sync_file. + +Interface:: + + struct sync_file *sync_file_create(struct dma_fence *fence); + +The caller pass the out-fence and gets back the sync_file. That is just the +first step, next it needs to install an fd on sync_file->file. So it gets an +fd:: + + fd = get_unused_fd_flags(O_CLOEXEC); + +and installs it on sync_file->file:: + + fd_install(fd, sync_file->file); + +The sync_file fd now can be sent to userspace. + +If the creation process fail, or the sync_file needs to be released by any +other reason fput(sync_file->file) should be used. + +Receiving Sync Files from Userspace +----------------------------------- + +When userspace needs to send an in-fence to the driver it passes file descriptor +of the Sync File to the kernel. The kernel can then retrieve the fences +from it. + +Interface:: + + struct dma_fence *sync_file_get_fence(int fd); + + +The returned reference is owned by the caller and must be disposed of +afterwards using dma_fence_put(). In case of error, a NULL is returned instead. + +References: + +1. struct sync_file in include/linux/sync_file.h +2. All interfaces mentioned above defined in include/linux/sync_file.h diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst new file mode 100644 index 000000000000..25eb7d5b834b --- /dev/null +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -0,0 +1,414 @@ +.. include:: + +===================== +VFIO Mediated devices +===================== + +:Copyright: |copy| 2016, NVIDIA CORPORATION. All rights reserved. +:Author: Neo Jia +:Author: Kirti Wankhede + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License version 2 as +published by the Free Software Foundation. + + +Virtual Function I/O (VFIO) Mediated devices[1] +=============================================== + +The number of use cases for virtualizing DMA devices that do not have built-in +SR_IOV capability is increasing. Previously, to virtualize such devices, +developers had to create their own management interfaces and APIs, and then +integrate them with user space software. To simplify integration with user space +software, we have identified common requirements and a unified management +interface for such devices. + +The VFIO driver framework provides unified APIs for direct device access. It is +an IOMMU/device-agnostic framework for exposing direct device access to user +space in a secure, IOMMU-protected environment. This framework is used for +multiple devices, such as GPUs, network adapters, and compute accelerators. With +direct device access, virtual machines or user space applications have direct +access to the physical device. This framework is reused for mediated devices. + +The mediated core driver provides a common interface for mediated device +management that can be used by drivers of different devices. This module +provides a generic interface to perform these operations: + +* Create and destroy a mediated device +* Add a mediated device to and remove it from a mediated bus driver +* Add a mediated device to and remove it from an IOMMU group + +The mediated core driver also provides an interface to register a bus driver. +For example, the mediated VFIO mdev driver is designed for mediated devices and +supports VFIO APIs. The mediated bus driver adds a mediated device to and +removes it from a VFIO group. + +The following high-level block diagram shows the main components and interfaces +in the VFIO mediated driver framework. The diagram shows NVIDIA, Intel, and IBM +devices as examples, as these devices are the first devices to use this module:: + + +---------------+ + | | + | +-----------+ | mdev_register_driver() +--------------+ + | | | +<------------------------+ | + | | mdev | | | | + | | bus | +------------------------>+ vfio_mdev.ko |<-> VFIO user + | | driver | | probe()/remove() | | APIs + | | | | +--------------+ + | +-----------+ | + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +-----------+ | mdev_register_device() +--------------+ + | | | +<------------------------+ | + | | | | | nvidia.ko |<-> physical + | | | +------------------------>+ | device + | | | | callbacks +--------------+ + | | Physical | | + | | device | | mdev_register_device() +--------------+ + | | interface | |<------------------------+ | + | | | | | i915.ko |<-> physical + | | | +------------------------>+ | device + | | | | callbacks +--------------+ + | | | | + | | | | mdev_register_device() +--------------+ + | | | +<------------------------+ | + | | | | | ccw_device.ko|<-> physical + | | | +------------------------>+ | device + | | | | callbacks +--------------+ + | +-----------+ | + +---------------+ + + +Registration Interfaces +======================= + +The mediated core driver provides the following types of registration +interfaces: + +* Registration interface for a mediated bus driver +* Physical device driver interface + +Registration Interface for a Mediated Bus Driver +------------------------------------------------ + +The registration interface for a mediated bus driver provides the following +structure to represent a mediated device's driver:: + + /* + * struct mdev_driver [2] - Mediated device's driver + * @name: driver name + * @probe: called when new device created + * @remove: called when device removed + * @driver: device driver structure + */ + struct mdev_driver { + const char *name; + int (*probe) (struct device *dev); + void (*remove) (struct device *dev); + struct device_driver driver; + }; + +A mediated bus driver for mdev should use this structure in the function calls +to register and unregister itself with the core driver: + +* Register:: + + extern int mdev_register_driver(struct mdev_driver *drv, + struct module *owner); + +* Unregister:: + + extern void mdev_unregister_driver(struct mdev_driver *drv); + +The mediated bus driver is responsible for adding mediated devices to the VFIO +group when devices are bound to the driver and removing mediated devices from +the VFIO when devices are unbound from the driver. + + +Physical Device Driver Interface +-------------------------------- + +The physical device driver interface provides the mdev_parent_ops[3] structure +to define the APIs to manage work in the mediated core driver that is related +to the physical device. + +The structures in the mdev_parent_ops structure are as follows: + +* dev_attr_groups: attributes of the parent device +* mdev_attr_groups: attributes of the mediated device +* supported_config: attributes to define supported configurations + +The functions in the mdev_parent_ops structure are as follows: + +* create: allocate basic resources in a driver for a mediated device +* remove: free resources in a driver when a mediated device is destroyed + +(Note that mdev-core provides no implicit serialization of create/remove +callbacks per mdev parent device, per mdev type, or any other categorization. +Vendor drivers are expected to be fully asynchronous in this respect or +provide their own internal resource protection.) + +The callbacks in the mdev_parent_ops structure are as follows: + +* open: open callback of mediated device +* close: close callback of mediated device +* ioctl: ioctl callback of mediated device +* read : read emulation callback +* write: write emulation callback +* mmap: mmap emulation callback + +A driver should use the mdev_parent_ops structure in the function call to +register itself with the mdev core driver:: + + extern int mdev_register_device(struct device *dev, + const struct mdev_parent_ops *ops); + +However, the mdev_parent_ops structure is not required in the function call +that a driver should use to unregister itself with the mdev core driver:: + + extern void mdev_unregister_device(struct device *dev); + + +Mediated Device Management Interface Through sysfs +================================================== + +The management interface through sysfs enables user space software, such as +libvirt, to query and configure mediated devices in a hardware-agnostic fashion. +This management interface provides flexibility to the underlying physical +device's driver to support features such as: + +* Mediated device hot plug +* Multiple mediated devices in a single virtual machine +* Multiple mediated devices from different physical devices + +Links in the mdev_bus Class Directory +------------------------------------- +The /sys/class/mdev_bus/ directory contains links to devices that are registered +with the mdev core driver. + +Directories and files under the sysfs for Each Physical Device +-------------------------------------------------------------- + +:: + + |- [parent physical device] + |--- Vendor-specific-attributes [optional] + |--- [mdev_supported_types] + | |--- [] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [] + | | |--- create + | | |--- name + | | |--- available_instances + | | |--- device_api + | | |--- description + | | |--- [devices] + | |--- [] + | |--- create + | |--- name + | |--- available_instances + | |--- device_api + | |--- description + | |--- [devices] + +* [mdev_supported_types] + + The list of currently supported mediated device types and their details. + + [], device_api, and available_instances are mandatory attributes + that should be provided by vendor driver. + +* [] + + The [] name is created by adding the device driver string as a prefix + to the string provided by the vendor driver. This format of this name is as + follows:: + + sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name); + + (or using mdev_parent_dev(mdev) to arrive at the parent device outside + of the core mdev code) + +* device_api + + This attribute should show which device API is being created, for example, + "vfio-pci" for a PCI device. + +* available_instances + + This attribute should show the number of devices of type that can be + created. + +* [device] + + This directory contains links to the devices of type that have been + created. + +* name + + This attribute should show human readable name. This is optional attribute. + +* description + + This attribute should show brief features/description of the type. This is + optional attribute. + +Directories and Files Under the sysfs for Each mdev Device +---------------------------------------------------------- + +:: + + |- [parent phy device] + |--- [$MDEV_UUID] + |--- remove + |--- mdev_type {link to its type} + |--- vendor-specific-attributes [optional] + +* remove (write only) + +Writing '1' to the 'remove' file destroys the mdev device. The vendor driver can +fail the remove() callback if that device is active and the vendor driver +doesn't support hot unplug. + +Example:: + + # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove + +Mediated device Hot plug +------------------------ + +Mediated devices can be created and assigned at runtime. The procedure to hot +plug a mediated device is the same as the procedure to hot plug a PCI device. + +Translation APIs for Mediated Devices +===================================== + +The following APIs are provided for translating user pfn to host pfn in a VFIO +driver:: + + extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, + int npage, int prot, unsigned long *phys_pfn); + + extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, + int npage); + +These functions call back into the back-end IOMMU module by using the pin_pages +and unpin_pages callbacks of the struct vfio_iommu_driver_ops[4]. Currently +these callbacks are supported in the TYPE1 IOMMU module. To enable them for +other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide +these two callback functions. + +Using the Sample Code +===================== + +mtty.c in samples/vfio-mdev/ directory is a sample driver program to +demonstrate how to use the mediated device framework. + +The sample driver creates an mdev device that simulates a serial port over a PCI +card. + +1. Build and load the mtty.ko module. + + This step creates a dummy device, /sys/devices/virtual/mtty/mtty/ + + Files in this device directory in sysfs are similar to the following:: + + # tree /sys/devices/virtual/mtty/mtty/ + /sys/devices/virtual/mtty/mtty/ + |-- mdev_supported_types + | |-- mtty-1 + | | |-- available_instances + | | |-- create + | | |-- device_api + | | |-- devices + | | `-- name + | `-- mtty-2 + | |-- available_instances + | |-- create + | |-- device_api + | |-- devices + | `-- name + |-- mtty_dev + | `-- sample_mtty_dev + |-- power + | |-- autosuspend_delay_ms + | |-- control + | |-- runtime_active_time + | |-- runtime_status + | `-- runtime_suspended_time + |-- subsystem -> ../../../../class/mtty + `-- uevent + +2. Create a mediated device by using the dummy device that you created in the + previous step:: + + # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \ + /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create + +3. Add parameters to qemu-kvm:: + + -device vfio-pci,\ + sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 + +4. Boot the VM. + + In the Linux guest VM, with no hardware on the host, the device appears + as follows:: + + # lspci -s 00:05.0 -xxvv + 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550]) + Subsystem: Device 4348:3253 + Physical Slot: 5 + Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- + Stepping- SERR- FastB2B- DisINTx- + Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort- + SERR- Link[LNKA] -> GSI 10 (level, high) -> IRQ 10 + 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A + 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A + + +5. In the Linux guest VM, check the serial ports:: + + # setserial -g /dev/ttyS* + /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4 + /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10 + /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10 + +6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or + /dev/ttyS2 with hardware flow control disabled. + +7. Type data on the minicom terminal or send data to the terminal emulation + program and read the data. + + Data is loop backed from hosts mtty driver. + +8. Destroy the mediated device that you created:: + + # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove + +References +========== + +1. See Documentation/driver-api/vfio.rst for more information on VFIO. +2. struct mdev_driver in include/linux/mdev.h +3. struct mdev_parent_ops in include/linux/mdev.h +4. struct vfio_iommu_driver_ops in include/linux/vfio.h diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst new file mode 100644 index 000000000000..f1a4d3c3ba0b --- /dev/null +++ b/Documentation/driver-api/vfio.rst @@ -0,0 +1,520 @@ +================================== +VFIO - "Virtual Function I/O" [1]_ +================================== + +Many modern system now provide DMA and interrupt remapping facilities +to help ensure I/O devices behave within the boundaries they've been +allotted. This includes x86 hardware with AMD-Vi and Intel VT-d, +POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC +systems such as Freescale PAMU. The VFIO driver is an IOMMU/device +agnostic framework for exposing direct device access to userspace, in +a secure, IOMMU protected environment. In other words, this allows +safe [2]_, non-privileged, userspace drivers. + +Why do we want that? Virtual machines often make use of direct device +access ("device assignment") when configured for the highest possible +I/O performance. From a device and host perspective, this simply +turns the VM into a userspace driver, with the benefits of +significantly reduced latency, higher bandwidth, and direct use of +bare-metal device drivers [3]_. + +Some applications, particularly in the high performance computing +field, also benefit from low-overhead, direct device access from +userspace. Examples include network adapters (often non-TCP/IP based) +and compute accelerators. Prior to VFIO, these drivers had to either +go through the full development cycle to become proper upstream +driver, be maintained out of tree, or make use of the UIO framework, +which has no notion of IOMMU protection, limited interrupt support, +and requires root privileges to access things like PCI configuration +space. + +The VFIO driver framework intends to unify these, replacing both the +KVM PCI specific device assignment code as well as provide a more +secure, more featureful userspace driver environment than UIO. + +Groups, Devices, and IOMMUs +--------------------------- + +Devices are the main target of any I/O driver. Devices typically +create a programming interface made up of I/O access, interrupts, +and DMA. Without going into the details of each of these, DMA is +by far the most critical aspect for maintaining a secure environment +as allowing a device read-write access to system memory imposes the +greatest risk to the overall system integrity. + +To help mitigate this risk, many modern IOMMUs now incorporate +isolation properties into what was, in many cases, an interface only +meant for translation (ie. solving the addressing problems of devices +with limited address spaces). With this, devices can now be isolated +from each other and from arbitrary memory access, thus allowing +things like secure direct assignment of devices into virtual machines. + +This isolation is not always at the granularity of a single device +though. Even when an IOMMU is capable of this, properties of devices, +interconnects, and IOMMU topologies can each reduce this isolation. +For instance, an individual device may be part of a larger multi- +function enclosure. While the IOMMU may be able to distinguish +between devices within the enclosure, the enclosure may not require +transactions between devices to reach the IOMMU. Examples of this +could be anything from a multi-function PCI device with backdoors +between functions to a non-PCI-ACS (Access Control Services) capable +bridge allowing redirection without reaching the IOMMU. Topology +can also play a factor in terms of hiding devices. A PCIe-to-PCI +bridge masks the devices behind it, making transaction appear as if +from the bridge itself. Obviously IOMMU design plays a major factor +as well. + +Therefore, while for the most part an IOMMU may have device level +granularity, any system is susceptible to reduced granularity. The +IOMMU API therefore supports a notion of IOMMU groups. A group is +a set of devices which is isolatable from all other devices in the +system. Groups are therefore the unit of ownership used by VFIO. + +While the group is the minimum granularity that must be used to +ensure secure user access, it's not necessarily the preferred +granularity. In IOMMUs which make use of page tables, it may be +possible to share a set of page tables between different groups, +reducing the overhead both to the platform (reduced TLB thrashing, +reduced duplicate page tables), and to the user (programming only +a single set of translations). For this reason, VFIO makes use of +a container class, which may hold one or more groups. A container +is created by simply opening the /dev/vfio/vfio character device. + +On its own, the container provides little functionality, with all +but a couple version and extension query interfaces locked away. +The user needs to add a group into the container for the next level +of functionality. To do this, the user first needs to identify the +group associated with the desired device. This can be done using +the sysfs links described in the example below. By unbinding the +device from the host driver and binding it to a VFIO driver, a new +VFIO group will appear for the group as /dev/vfio/$GROUP, where +$GROUP is the IOMMU group number of which the device is a member. +If the IOMMU group contains multiple devices, each will need to +be bound to a VFIO driver before operations on the VFIO group +are allowed (it's also sufficient to only unbind the device from +host drivers if a VFIO driver is unavailable; this will make the +group available, but not that particular device). TBD - interface +for disabling driver probing/locking a device. + +Once the group is ready, it may be added to the container by opening +the VFIO group character device (/dev/vfio/$GROUP) and using the +VFIO_GROUP_SET_CONTAINER ioctl, passing the file descriptor of the +previously opened container file. If desired and if the IOMMU driver +supports sharing the IOMMU context between groups, multiple groups may +be set to the same container. If a group fails to set to a container +with existing groups, a new empty container will need to be used +instead. + +With a group (or groups) attached to a container, the remaining +ioctls become available, enabling access to the VFIO IOMMU interfaces. +Additionally, it now becomes possible to get file descriptors for each +device within a group using an ioctl on the VFIO group file descriptor. + +The VFIO device API includes ioctls for describing the device, the I/O +regions and their read/write/mmap offsets on the device descriptor, as +well as mechanisms for describing and registering interrupt +notifications. + +VFIO Usage Example +------------------ + +Assume user wants to access PCI device 0000:06:0d.0:: + + $ readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group + ../../../../kernel/iommu_groups/26 + +This device is therefore in IOMMU group 26. This device is on the +pci bus, therefore the user will make use of vfio-pci to manage the +group:: + + # modprobe vfio-pci + +Binding this device to the vfio-pci driver creates the VFIO group +character devices for this group:: + + $ lspci -n -s 0000:06:0d.0 + 06:0d.0 0401: 1102:0002 (rev 08) + # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind + # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id + +Now we need to look at what other devices are in the group to free +it for use by VFIO:: + + $ ls -l /sys/bus/pci/devices/0000:06:0d.0/iommu_group/devices + total 0 + lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:00:1e.0 -> + ../../../../devices/pci0000:00/0000:00:1e.0 + lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.0 -> + ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.0 + lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.1 -> + ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.1 + +This device is behind a PCIe-to-PCI bridge [4]_, therefore we also +need to add device 0000:06:0d.1 to the group following the same +procedure as above. Device 0000:00:1e.0 is a bridge that does +not currently have a host driver, therefore it's not required to +bind this device to the vfio-pci driver (vfio-pci does not currently +support PCI bridges). + +The final step is to provide the user with access to the group if +unprivileged operation is desired (note that /dev/vfio/vfio provides +no capabilities on its own and is therefore expected to be set to +mode 0666 by the system):: + + # chown user:user /dev/vfio/26 + +The user now has full access to all the devices and the iommu for this +group and can access them as follows:: + + int container, group, device, i; + struct vfio_group_status group_status = + { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + + /* Create a new container */ + container = open("/dev/vfio/vfio", O_RDWR); + + if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) + /* Unknown API version */ + + if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) + /* Doesn't support the IOMMU driver we want. */ + + /* Open the group */ + group = open("/dev/vfio/26", O_RDWR); + + /* Test the group is viable and available */ + ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) + /* Group is not viable (ie, not all devices bound for vfio) */ + + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); + + /* Get addition IOMMU info */ + ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); + + /* Test and setup the device */ + ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); + + for (i = 0; i < device_info.num_regions; i++) { + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + + reg.index = i; + + ioctl(device, VFIO_DEVICE_GET_REGION_INFO, ®); + + /* Setup mappings... read/write offsets, mmaps + * For PCI devices, config space is a region */ + } + + for (i = 0; i < device_info.num_irqs; i++) { + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + + irq.index = i; + + ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq); + + /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */ + } + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + +VFIO User API +------------------------------------------------------------------------------- + +Please see include/linux/vfio.h for complete API documentation. + +VFIO bus driver API +------------------------------------------------------------------------------- + +VFIO bus drivers, such as vfio-pci make use of only a few interfaces +into VFIO core. When devices are bound and unbound to the driver, +the driver should call vfio_add_group_dev() and vfio_del_group_dev() +respectively:: + + extern int vfio_add_group_dev(struct device *dev, + const struct vfio_device_ops *ops, + void *device_data); + + extern void *vfio_del_group_dev(struct device *dev); + +vfio_add_group_dev() indicates to the core to begin tracking the +iommu_group of the specified dev and register the dev as owned by +a VFIO bus driver. The driver provides an ops structure for callbacks +similar to a file operations structure:: + + struct vfio_device_ops { + int (*open)(void *device_data); + void (*release)(void *device_data); + ssize_t (*read)(void *device_data, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(void *device_data, const char __user *buf, + size_t size, loff_t *ppos); + long (*ioctl)(void *device_data, unsigned int cmd, + unsigned long arg); + int (*mmap)(void *device_data, struct vm_area_struct *vma); + }; + +Each function is passed the device_data that was originally registered +in the vfio_add_group_dev() call above. This allows the bus driver +an easy place to store its opaque, private data. The open/release +callbacks are issued when a new file descriptor is created for a +device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides +a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap +interfaces implement the device region access defined by the device's +own VFIO_DEVICE_GET_REGION_INFO ioctl. + + +PPC64 sPAPR implementation note +------------------------------- + +This implementation has some specifics: + +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per + container is supported as an IOMMU table is allocated at the boot time, + one table per a IOMMU group which is a Partitionable Endpoint (PE) + (PE is often a PCI domain but not always). + + Newer systems (POWER8 with IODA2) have improved hardware design which allows + to remove this limitation and have multiple IOMMU groups per a VFIO + container. + +2) The hardware supports so called DMA windows - the PCI address range + within which DMA transfer is allowed, any attempt to access address space + out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API + to map/unmap pages for DMA, and it normally maps 1..32 pages per call and + currently there is no way to reduce the number of calls. In order to make + things faster, the map/unmap handling has been implemented in real mode + which provides an excellent performance which has limitations such as + inability to do locked pages accounting in real time. + +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O + subtree that can be treated as a unit for the purposes of partitioning and + error recovery. A PE may be a single or multi-function IOA (IO Adapter), a + function of a multi-function IOA, or multiple IOAs (possibly including + switch and bridge structures above the multiple IOAs). PPC64 guests detect + PCI errors and recover from them via EEH RTAS services, which works on the + basis of additional ioctl commands. + + So 4 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO + returns the size and the start of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE + enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE + disables the container. + + VFIO_EEH_PE_OP + provides an API for EEH setup, error detection and recovery. + + The code flow from the example above should be slightly changed:: + + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op), .flags = 0 }; + + ..... + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get addition sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); + + .... + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + + /* Make sure EEH is supported */ + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + + /* Enable the EEH functionality on the device */ + pe_op.op = VFIO_EEH_PE_ENABLE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* You're suggested to create additional data struct to represent + * PE, and put child devices belonging to same IOMMU group to the + * PE instance for later reference. + */ + + /* Check the PE's state and make sure it's in functional state */ + pe_op.op = VFIO_EEH_PE_GET_STATE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Save device state using pci_save_state(). + * EEH should be enabled on the specified device. + */ + + .... + + /* Inject EEH error, which is expected to be caused by 32-bits + * config load. + */ + pe_op.op = VFIO_EEH_PE_INJECT_ERR; + pe_op.err.type = EEH_ERR_TYPE_32; + pe_op.err.func = EEH_ERR_FUNC_LD_CFG_ADDR; + pe_op.err.addr = 0ul; + pe_op.err.mask = 0ul; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + .... + + /* When 0xFF's returned from reading PCI config space or IO BARs + * of the PCI device. Check the PE's state to see if that has been + * frozen. + */ + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Waiting for pending PCI transactions to be completed and don't + * produce any more PCI traffic from/to the affected PE until + * recovery is finished. + */ + + /* Enable IO for the affected PE and collect logs. Usually, the + * standard part of PCI config space, AER registers are dumped + * as logs for further analysis. + */ + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* + * Issue PE reset: hot or fundamental reset. Usually, hot reset + * is enough. However, the firmware of some PCI adapters would + * require fundamental reset. + */ + pe_op.op = VFIO_EEH_PE_RESET_HOT; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Configure the PCI bridges for the affected PE */ + pe_op.op = VFIO_EEH_PE_CONFIGURE; + ioctl(container, VFIO_EEH_PE_OP, &pe_op); + + /* Restored state we saved at initialization time. pci_restore_state() + * is good enough as an example. + */ + + /* Hopefully, error is recovered successfully. Now, you can resume to + * start PCI traffic to/from the affected PE. + */ + + .... + +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/ + VFIO_IOMMU_DISABLE and implements 2 new ioctls: + VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY + (which are unsupported in v1 IOMMU). + + PPC64 paravirtualized guests generate a lot of map/unmap requests, + and the handling of those includes pinning/unpinning pages and updating + mm::locked_vm counter to make sure we do not exceed the rlimit. + The v2 IOMMU splits accounting and pinning into separate operations: + + - VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls + receive a user space address and size of the block to be pinned. + Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to + be called with the exact address and size used for registering + the memory block. The userspace is not expected to call these often. + The ranges are stored in a linked list in a VFIO container. + + - VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual + IOMMU table and do not do pinning; instead these check that the userspace + address is from pre-registered range. + + This separation helps in optimizing DMA for guests. + +6) sPAPR specification allows guests to have an additional DMA window(s) on + a PCI bus with a variable page size. Two ioctls have been added to support + this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE. + The platform has to support the functionality or error will be returned to + the userspace. The existing hardware supports up to 2 DMA windows, one is + 2GB long, uses 4K pages and called "default 32bit window"; the other can + be as big as entire RAM, use different page size, it is optional - guests + create those in run-time if the guest driver supports 64bit DMA. + + VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and + a number of TCE table levels (if a TCE table is going to be big enough and + the kernel may not be able to allocate enough of physically contiguous + memory). It creates a new window in the available slot and returns the bus + address where the new window starts. Due to hardware limitation, the user + space cannot choose the location of DMA windows. + + VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window + and removes it. + +------------------------------------------------------------------------------- + +.. [1] VFIO was originally an acronym for "Virtual Function I/O" in its + initial implementation by Tom Lyon while as Cisco. We've since + outgrown the acronym, but it's catchy. + +.. [2] "safe" also depends upon a device being "well behaved". It's + possible for multi-function devices to have backdoors between + functions and even for single function devices to have alternative + access to things like PCI config space through MMIO registers. To + guard against the former we can include additional precautions in the + IOMMU driver to group multi-function PCI devices together + (iommu=group_mf). The latter we can't prevent, but the IOMMU should + still provide isolation. For PCI, SR-IOV Virtual Functions are the + best indicator of "well behaved", as these are designed for + virtualization usage models. + +.. [3] As always there are trade-offs to virtual machine device + assignment that are beyond the scope of VFIO. It's expected that + future IOMMU technologies will reduce some, but maybe not all, of + these trade-offs. + +.. [4] In this case the device is below a PCI bridge, so transactions + from either function of the device are indistinguishable to the iommu:: + + -[0000:00]-+-1e.0-[06]--+-0d.0 + \-0d.1 + + 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) diff --git a/Documentation/driver-api/xillybus.rst b/Documentation/driver-api/xillybus.rst new file mode 100644 index 000000000000..2446ee303c09 --- /dev/null +++ b/Documentation/driver-api/xillybus.rst @@ -0,0 +1,379 @@ +========================================== +Xillybus driver for generic FPGA interface +========================================== + +:Author: Eli Billauer, Xillybus Ltd. (http://xillybus.com) +:Email: eli.billauer@gmail.com or as advertised on Xillybus' site. + +.. Contents: + + - Introduction + -- Background + -- Xillybus Overview + + - Usage + -- User interface + -- Synchronization + -- Seekable pipes + + - Internals + -- Source code organization + -- Pipe attributes + -- Host never reads from the FPGA + -- Channels, pipes, and the message channel + -- Data streaming + -- Data granularity + -- Probing + -- Buffer allocation + -- The "nonempty" message (supporting poll) + + +Introduction +============ + +Background +---------- + +An FPGA (Field Programmable Gate Array) is a piece of logic hardware, which +can be programmed to become virtually anything that is usually found as a +dedicated chipset: For instance, a display adapter, network interface card, +or even a processor with its peripherals. FPGAs are the LEGO of hardware: +Based upon certain building blocks, you make your own toys the way you like +them. It's usually pointless to reimplement something that is already +available on the market as a chipset, so FPGAs are mostly used when some +special functionality is needed, and the production volume is relatively low +(hence not justifying the development of an ASIC). + +The challenge with FPGAs is that everything is implemented at a very low +level, even lower than assembly language. In order to allow FPGA designers to +focus on their specific project, and not reinvent the wheel over and over +again, pre-designed building blocks, IP cores, are often used. These are the +FPGA parallels of library functions. IP cores may implement certain +mathematical functions, a functional unit (e.g. a USB interface), an entire +processor (e.g. ARM) or anything that might come handy. Think of them as a +building block, with electrical wires dangling on the sides for connection to +other blocks. + +One of the daunting tasks in FPGA design is communicating with a fullblown +operating system (actually, with the processor running it): Implementing the +low-level bus protocol and the somewhat higher-level interface with the host +(registers, interrupts, DMA etc.) is a project in itself. When the FPGA's +function is a well-known one (e.g. a video adapter card, or a NIC), it can +make sense to design the FPGA's interface logic specifically for the project. +A special driver is then written to present the FPGA as a well-known interface +to the kernel and/or user space. In that case, there is no reason to treat the +FPGA differently than any device on the bus. + +It's however common that the desired data communication doesn't fit any well- +known peripheral function. Also, the effort of designing an elegant +abstraction for the data exchange is often considered too big. In those cases, +a quicker and possibly less elegant solution is sought: The driver is +effectively written as a user space program, leaving the kernel space part +with just elementary data transport. This still requires designing some +interface logic for the FPGA, and write a simple ad-hoc driver for the kernel. + +Xillybus Overview +----------------- + +Xillybus is an IP core and a Linux driver. Together, they form a kit for +elementary data transport between an FPGA and the host, providing pipe-like +data streams with a straightforward user interface. It's intended as a low- +effort solution for mixed FPGA-host projects, for which it makes sense to +have the project-specific part of the driver running in a user-space program. + +Since the communication requirements may vary significantly from one FPGA +project to another (the number of data pipes needed in each direction and +their attributes), there isn't one specific chunk of logic being the Xillybus +IP core. Rather, the IP core is configured and built based upon a +specification given by its end user. + +Xillybus presents independent data streams, which resemble pipes or TCP/IP +communication to the user. At the host side, a character device file is used +just like any pipe file. On the FPGA side, hardware FIFOs are used to stream +the data. This is contrary to a common method of communicating through fixed- +sized buffers (even though such buffers are used by Xillybus under the hood). +There may be more than a hundred of these streams on a single IP core, but +also no more than one, depending on the configuration. + +In order to ease the deployment of the Xillybus IP core, it contains a simple +data structure which completely defines the core's configuration. The Linux +driver fetches this data structure during its initialization process, and sets +up the DMA buffers and character devices accordingly. As a result, a single +driver is used to work out of the box with any Xillybus IP core. + +The data structure just mentioned should not be confused with PCI's +configuration space or the Flattened Device Tree. + +Usage +===== + +User interface +-------------- + +On the host, all interface with Xillybus is done through /dev/xillybus_* +device files, which are generated automatically as the drivers loads. The +names of these files depend on the IP core that is loaded in the FPGA (see +Probing below). To communicate with the FPGA, open the device file that +corresponds to the hardware FIFO you want to send data or receive data from, +and use plain write() or read() calls, just like with a regular pipe. In +particular, it makes perfect sense to go:: + + $ cat mydata > /dev/xillybus_thisfifo + + $ cat /dev/xillybus_thatfifo > hisdata + +possibly pressing CTRL-C as some stage, even though the xillybus_* pipes have +the capability to send an EOF (but may not use it). + +The driver and hardware are designed to behave sensibly as pipes, including: + +* Supporting non-blocking I/O (by setting O_NONBLOCK on open() ). + +* Supporting poll() and select(). + +* Being bandwidth efficient under load (using DMA) but also handle small + pieces of data sent across (like TCP/IP) by autoflushing. + +A device file can be read only, write only or bidirectional. Bidirectional +device files are treated like two independent pipes (except for sharing a +"channel" structure in the implementation code). + +Synchronization +--------------- + +Xillybus pipes are configured (on the IP core) to be either synchronous or +asynchronous. For a synchronous pipe, write() returns successfully only after +some data has been submitted and acknowledged by the FPGA. This slows down +bulk data transfers, and is nearly impossible for use with streams that +require data at a constant rate: There is no data transmitted to the FPGA +between write() calls, in particular when the process loses the CPU. + +When a pipe is configured asynchronous, write() returns if there was enough +room in the buffers to store any of the data in the buffers. + +For FPGA to host pipes, asynchronous pipes allow data transfer from the FPGA +as soon as the respective device file is opened, regardless of if the data +has been requested by a read() call. On synchronous pipes, only the amount +of data requested by a read() call is transmitted. + +In summary, for synchronous pipes, data between the host and FPGA is +transmitted only to satisfy the read() or write() call currently handled +by the driver, and those calls wait for the transmission to complete before +returning. + +Note that the synchronization attribute has nothing to do with the possibility +that read() or write() completes less bytes than requested. There is a +separate configuration flag ("allowpartial") that determines whether such a +partial completion is allowed. + +Seekable pipes +-------------- + +A synchronous pipe can be configured to have the stream's position exposed +to the user logic at the FPGA. Such a pipe is also seekable on the host API. +With this feature, a memory or register interface can be attached on the +FPGA side to the seekable stream. Reading or writing to a certain address in +the attached memory is done by seeking to the desired address, and calling +read() or write() as required. + + +Internals +========= + +Source code organization +------------------------ + +The Xillybus driver consists of a core module, xillybus_core.c, and modules +that depend on the specific bus interface (xillybus_of.c and xillybus_pcie.c). + +The bus specific modules are those probed when a suitable device is found by +the kernel. Since the DMA mapping and synchronization functions, which are bus +dependent by their nature, are used by the core module, a +xilly_endpoint_hardware structure is passed to the core module on +initialization. This structure is populated with pointers to wrapper functions +which execute the DMA-related operations on the bus. + +Pipe attributes +--------------- + +Each pipe has a number of attributes which are set when the FPGA component +(IP core) is built. They are fetched from the IDT (the data structure which +defines the core's configuration, see Probing below) by xilly_setupchannels() +in xillybus_core.c as follows: + +* is_writebuf: The pipe's direction. A non-zero value means it's an FPGA to + host pipe (the FPGA "writes"). + +* channelnum: The pipe's identification number in communication between the + host and FPGA. + +* format: The underlying data width. See Data Granularity below. + +* allowpartial: A non-zero value means that a read() or write() (whichever + applies) may return with less than the requested number of bytes. The common + choice is a non-zero value, to match standard UNIX behavior. + +* synchronous: A non-zero value means that the pipe is synchronous. See + Synchronization above. + +* bufsize: Each DMA buffer's size. Always a power of two. + +* bufnum: The number of buffers allocated for this pipe. Always a power of two. + +* exclusive_open: A non-zero value forces exclusive opening of the associated + device file. If the device file is bidirectional, and already opened only in + one direction, the opposite direction may be opened once. + +* seekable: A non-zero value indicates that the pipe is seekable. See + Seekable pipes above. + +* supports_nonempty: A non-zero value (which is typical) indicates that the + hardware will send the messages that are necessary to support select() and + poll() for this pipe. + +Host never reads from the FPGA +------------------------------ + +Even though PCI Express is hotpluggable in general, a typical motherboard +doesn't expect a card to go away all of the sudden. But since the PCIe card +is based upon reprogrammable logic, a sudden disappearance from the bus is +quite likely as a result of an accidental reprogramming of the FPGA while the +host is up. In practice, nothing happens immediately in such a situation. But +if the host attempts to read from an address that is mapped to the PCI Express +device, that leads to an immediate freeze of the system on some motherboards, +even though the PCIe standard requires a graceful recovery. + +In order to avoid these freezes, the Xillybus driver refrains completely from +reading from the device's register space. All communication from the FPGA to +the host is done through DMA. In particular, the Interrupt Service Routine +doesn't follow the common practice of checking a status register when it's +invoked. Rather, the FPGA prepares a small buffer which contains short +messages, which inform the host what the interrupt was about. + +This mechanism is used on non-PCIe buses as well for the sake of uniformity. + + +Channels, pipes, and the message channel +---------------------------------------- + +Each of the (possibly bidirectional) pipes presented to the user is allocated +a data channel between the FPGA and the host. The distinction between channels +and pipes is necessary only because of channel 0, which is used for interrupt- +related messages from the FPGA, and has no pipe attached to it. + +Data streaming +-------------- + +Even though a non-segmented data stream is presented to the user at both +sides, the implementation relies on a set of DMA buffers which is allocated +for each channel. For the sake of illustration, let's take the FPGA to host +direction: As data streams into the respective channel's interface in the +FPGA, the Xillybus IP core writes it to one of the DMA buffers. When the +buffer is full, the FPGA informs the host about that (appending a +XILLYMSG_OPCODE_RELEASEBUF message channel 0 and sending an interrupt if +necessary). The host responds by making the data available for reading through +the character device. When all data has been read, the host writes on the +the FPGA's buffer control register, allowing the buffer's overwriting. Flow +control mechanisms exist on both sides to prevent underflows and overflows. + +This is not good enough for creating a TCP/IP-like stream: If the data flow +stops momentarily before a DMA buffer is filled, the intuitive expectation is +that the partial data in buffer will arrive anyhow, despite the buffer not +being completed. This is implemented by adding a field in the +XILLYMSG_OPCODE_RELEASEBUF message, through which the FPGA informs not just +which buffer is submitted, but how much data it contains. + +But the FPGA will submit a partially filled buffer only if directed to do so +by the host. This situation occurs when the read() method has been blocking +for XILLY_RX_TIMEOUT jiffies (currently 10 ms), after which the host commands +the FPGA to submit a DMA buffer as soon as it can. This timeout mechanism +balances between bus bandwidth efficiency (preventing a lot of partially +filled buffers being sent) and a latency held fairly low for tails of data. + +A similar setting is used in the host to FPGA direction. The handling of +partial DMA buffers is somewhat different, though. The user can tell the +driver to submit all data it has in the buffers to the FPGA, by issuing a +write() with the byte count set to zero. This is similar to a flush request, +but it doesn't block. There is also an autoflushing mechanism, which triggers +an equivalent flush roughly XILLY_RX_TIMEOUT jiffies after the last write(). +This allows the user to be oblivious about the underlying buffering mechanism +and yet enjoy a stream-like interface. + +Note that the issue of partial buffer flushing is irrelevant for pipes having +the "synchronous" attribute nonzero, since synchronous pipes don't allow data +to lay around in the DMA buffers between read() and write() anyhow. + +Data granularity +---------------- + +The data arrives or is sent at the FPGA as 8, 16 or 32 bit wide words, as +configured by the "format" attribute. Whenever possible, the driver attempts +to hide this when the pipe is accessed differently from its natural alignment. +For example, reading single bytes from a pipe with 32 bit granularity works +with no issues. Writing single bytes to pipes with 16 or 32 bit granularity +will also work, but the driver can't send partially completed words to the +FPGA, so the transmission of up to one word may be held until it's fully +occupied with user data. + +This somewhat complicates the handling of host to FPGA streams, because +when a buffer is flushed, it may contain up to 3 bytes don't form a word in +the FPGA, and hence can't be sent. To prevent loss of data, these leftover +bytes need to be moved to the next buffer. The parts in xillybus_core.c +that mention "leftovers" in some way are related to this complication. + +Probing +------- + +As mentioned earlier, the number of pipes that are created when the driver +loads and their attributes depend on the Xillybus IP core in the FPGA. During +the driver's initialization, a blob containing configuration info, the +Interface Description Table (IDT), is sent from the FPGA to the host. The +bootstrap process is done in three phases: + +1. Acquire the length of the IDT, so a buffer can be allocated for it. This + is done by sending a quiesce command to the device, since the acknowledge + for this command contains the IDT's buffer length. + +2. Acquire the IDT itself. + +3. Create the interfaces according to the IDT. + +Buffer allocation +----------------- + +In order to simplify the logic that prevents illegal boundary crossings of +PCIe packets, the following rule applies: If a buffer is smaller than 4kB, +it must not cross a 4kB boundary. Otherwise, it must be 4kB aligned. The +xilly_setupchannels() functions allocates these buffers by requesting whole +pages from the kernel, and diving them into DMA buffers as necessary. Since +all buffers' sizes are powers of two, it's possible to pack any set of such +buffers, with a maximal waste of one page of memory. + +All buffers are allocated when the driver is loaded. This is necessary, +since large continuous physical memory segments are sometimes requested, +which are more likely to be available when the system is freshly booted. + +The allocation of buffer memory takes place in the same order they appear in +the IDT. The driver relies on a rule that the pipes are sorted with decreasing +buffer size in the IDT. If a requested buffer is larger or equal to a page, +the necessary number of pages is requested from the kernel, and these are +used for this buffer. If the requested buffer is smaller than a page, one +single page is requested from the kernel, and that page is partially used. +Or, if there already is a partially used page at hand, the buffer is packed +into that page. It can be shown that all pages requested from the kernel +(except possibly for the last) are 100% utilized this way. + +The "nonempty" message (supporting poll) +---------------------------------------- + +In order to support the "poll" method (and hence select() ), there is a small +catch regarding the FPGA to host direction: The FPGA may have filled a DMA +buffer with some data, but not submitted that buffer. If the host waited for +the buffer's submission by the FPGA, there would be a possibility that the +FPGA side has sent data, but a select() call would still block, because the +host has not received any notification about this. This is solved with +XILLYMSG_OPCODE_NONEMPTY messages sent by the FPGA when a channel goes from +completely empty to containing some data. + +These messages are used only to support poll() and select(). The IP core can +be configured not to send them for a slight reduction of bandwidth. diff --git a/Documentation/driver-api/zorro.rst b/Documentation/driver-api/zorro.rst new file mode 100644 index 000000000000..664072b017e3 --- /dev/null +++ b/Documentation/driver-api/zorro.rst @@ -0,0 +1,104 @@ +======================================== +Writing Device Drivers for Zorro Devices +======================================== + +:Author: Written by Geert Uytterhoeven +:Last revised: September 5, 2003 + + +Introduction +------------ + +The Zorro bus is the bus used in the Amiga family of computers. Thanks to +AutoConfig(tm), it's 100% Plug-and-Play. + +There are two types of Zorro buses, Zorro II and Zorro III: + + - The Zorro II address space is 24-bit and lies within the first 16 MB of the + Amiga's address map. + + - Zorro III is a 32-bit extension of Zorro II, which is backwards compatible + with Zorro II. The Zorro III address space lies outside the first 16 MB. + + +Probing for Zorro Devices +------------------------- + +Zorro devices are found by calling ``zorro_find_device()``, which returns a +pointer to the ``next`` Zorro device with the specified Zorro ID. A probe loop +for the board with Zorro ID ``ZORRO_PROD_xxx`` looks like:: + + struct zorro_dev *z = NULL; + + while ((z = zorro_find_device(ZORRO_PROD_xxx, z))) { + if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE, + "My explanation")) + ... + } + +``ZORRO_WILDCARD`` acts as a wildcard and finds any Zorro device. If your driver +supports different types of boards, you can use a construct like:: + + struct zorro_dev *z = NULL; + + while ((z = zorro_find_device(ZORRO_WILDCARD, z))) { + if (z->id != ZORRO_PROD_xxx1 && z->id != ZORRO_PROD_xxx2 && ...) + continue; + if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE, + "My explanation")) + ... + } + + +Zorro Resources +--------------- + +Before you can access a Zorro device's registers, you have to make sure it's +not yet in use. This is done using the I/O memory space resource management +functions:: + + request_mem_region() + release_mem_region() + +Shortcuts to claim the whole device's address space are provided as well:: + + zorro_request_device + zorro_release_device + + +Accessing the Zorro Address Space +--------------------------------- + +The address regions in the Zorro device resources are Zorro bus address +regions. Due to the identity bus-physical address mapping on the Zorro bus, +they are CPU physical addresses as well. + +The treatment of these regions depends on the type of Zorro space: + + - Zorro II address space is always mapped and does not have to be mapped + explicitly using z_ioremap(). + + Conversion from bus/physical Zorro II addresses to kernel virtual addresses + and vice versa is done using:: + + virt_addr = ZTWO_VADDR(bus_addr); + bus_addr = ZTWO_PADDR(virt_addr); + + - Zorro III address space must be mapped explicitly using z_ioremap() first + before it can be accessed:: + + virt_addr = z_ioremap(bus_addr, size); + ... + z_iounmap(virt_addr); + + +References +---------- + +#. linux/include/linux/zorro.h +#. linux/include/uapi/linux/zorro.h +#. linux/include/uapi/linux/zorro_ids.h +#. linux/arch/m68k/include/asm/zorro.h +#. linux/drivers/zorro +#. /proc/bus/zorro + diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt deleted file mode 100644 index c07565ba57da..000000000000 --- a/Documentation/eisa.txt +++ /dev/null @@ -1,230 +0,0 @@ -================ -EISA bus support -================ - -:Author: Marc Zyngier - -This document groups random notes about porting EISA drivers to the -new EISA/sysfs API. - -Starting from version 2.5.59, the EISA bus is almost given the same -status as other much more mainstream busses such as PCI or USB. This -has been possible through sysfs, which defines a nice enough set of -abstractions to manage busses, devices and drivers. - -Although the new API is quite simple to use, converting existing -drivers to the new infrastructure is not an easy task (mostly because -detection code is generally also used to probe ISA cards). Moreover, -most EISA drivers are among the oldest Linux drivers so, as you can -imagine, some dust has settled here over the years. - -The EISA infrastructure is made up of three parts: - - - The bus code implements most of the generic code. It is shared - among all the architectures that the EISA code runs on. It - implements bus probing (detecting EISA cards available on the bus), - allocates I/O resources, allows fancy naming through sysfs, and - offers interfaces for driver to register. - - - The bus root driver implements the glue between the bus hardware - and the generic bus code. It is responsible for discovering the - device implementing the bus, and setting it up to be latter probed - by the bus code. This can go from something as simple as reserving - an I/O region on x86, to the rather more complex, like the hppa - EISA code. This is the part to implement in order to have EISA - running on an "new" platform. - - - The driver offers the bus a list of devices that it manages, and - implements the necessary callbacks to probe and release devices - whenever told to. - -Every function/structure below lives in , which depends -heavily on . - -Bus root driver -=============== - -:: - - int eisa_root_register (struct eisa_root_device *root); - -The eisa_root_register function is used to declare a device as the -root of an EISA bus. The eisa_root_device structure holds a reference -to this device, as well as some parameters for probing purposes:: - - struct eisa_root_device { - struct device *dev; /* Pointer to bridge device */ - struct resource *res; - unsigned long bus_base_addr; - int slots; /* Max slot number */ - int force_probe; /* Probe even when no slot 0 */ - u64 dma_mask; /* from bridge device */ - int bus_nr; /* Set by eisa_root_register */ - struct resource eisa_root_res; /* ditto */ - }; - -============= ====================================================== -node used for eisa_root_register internal purpose -dev pointer to the root device -res root device I/O resource -bus_base_addr slot 0 address on this bus -slots max slot number to probe -force_probe Probe even when slot 0 is empty (no EISA mainboard) -dma_mask Default DMA mask. Usually the bridge device dma_mask. -bus_nr unique bus id, set by eisa_root_register -============= ====================================================== - -Driver -====== - -:: - - int eisa_driver_register (struct eisa_driver *edrv); - void eisa_driver_unregister (struct eisa_driver *edrv); - -Clear enough ? - -:: - - struct eisa_device_id { - char sig[EISA_SIG_LEN]; - unsigned long driver_data; - }; - - struct eisa_driver { - const struct eisa_device_id *id_table; - struct device_driver driver; - }; - -=============== ==================================================== -id_table an array of NULL terminated EISA id strings, - followed by an empty string. Each string can - optionally be paired with a driver-dependent value - (driver_data). - -driver a generic driver, such as described in - Documentation/driver-api/driver-model/driver.rst. Only .name, - .probe and .remove members are mandatory. -=============== ==================================================== - -An example is the 3c59x driver:: - - static struct eisa_device_id vortex_eisa_ids[] = { - { "TCM5920", EISA_3C592_OFFSET }, - { "TCM5970", EISA_3C597_OFFSET }, - { "" } - }; - - static struct eisa_driver vortex_eisa_driver = { - .id_table = vortex_eisa_ids, - .driver = { - .name = "3c59x", - .probe = vortex_eisa_probe, - .remove = vortex_eisa_remove - } - }; - -Device -====== - -The sysfs framework calls .probe and .remove functions upon device -discovery and removal (note that the .remove function is only called -when driver is built as a module). - -Both functions are passed a pointer to a 'struct device', which is -encapsulated in a 'struct eisa_device' described as follows:: - - struct eisa_device { - struct eisa_device_id id; - int slot; - int state; - unsigned long base_addr; - struct resource res[EISA_MAX_RESOURCES]; - u64 dma_mask; - struct device dev; /* generic device */ - }; - -======== ============================================================ -id EISA id, as read from device. id.driver_data is set from the - matching driver EISA id. -slot slot number which the device was detected on -state set of flags indicating the state of the device. Current - flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED. -res set of four 256 bytes I/O regions allocated to this device -dma_mask DMA mask set from the parent device. -dev generic device (see Documentation/driver-api/driver-model/device.rst) -======== ============================================================ - -You can get the 'struct eisa_device' from 'struct device' using the -'to_eisa_device' macro. - -Misc stuff -========== - -:: - - void eisa_set_drvdata (struct eisa_device *edev, void *data); - -Stores data into the device's driver_data area. - -:: - - void *eisa_get_drvdata (struct eisa_device *edev): - -Gets the pointer previously stored into the device's driver_data area. - -:: - - int eisa_get_region_index (void *addr); - -Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given -address. - -Kernel parameters -================= - -eisa_bus.enable_dev - A comma-separated list of slots to be enabled, even if the firmware - set the card as disabled. The driver must be able to properly - initialize the device in such conditions. - -eisa_bus.disable_dev - A comma-separated list of slots to be enabled, even if the firmware - set the card as enabled. The driver won't be called to handle this - device. - -virtual_root.force_probe - Force the probing code to probe EISA slots even when it cannot find an - EISA compliant mainboard (nothing appears on slot 0). Defaults to 0 - (don't force), and set to 1 (force probing) when either - CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. - -Random notes -============ - -Converting an EISA driver to the new API mostly involves *deleting* -code (since probing is now in the core EISA code). Unfortunately, most -drivers share their probing routine between ISA, and EISA. Special -care must be taken when ripping out the EISA code, so other busses -won't suffer from these surgical strikes... - -You *must not* expect any EISA device to be detected when returning -from eisa_driver_register, since the chances are that the bus has not -yet been probed. In fact, that's what happens most of the time (the -bus root driver usually kicks in rather late in the boot process). -Unfortunately, most drivers are doing the probing by themselves, and -expect to have explored the whole machine when they exit their probe -routine. - -For example, switching your favorite EISA SCSI card to the "hotplug" -model is "the right thing"(tm). - -Thanks -====== - -I'd like to thank the following people for their help: - -- Xavier Benigni for lending me a wonderful Alpha Jensen, -- James Bottomley, Jeff Garzik for getting this stuff into the kernel, -- Andries Brouwer for contributing numerous EISA ids, -- Catrin Jones for coping with far too many machines at home. diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst index 26bc5cdaabab..ebca41785abe 100644 --- a/Documentation/fb/fbcon.rst +++ b/Documentation/fb/fbcon.rst @@ -187,7 +187,7 @@ the hardware. Thus, in a VGA console:: Assuming the VGA driver can be unloaded, one must first unbind the VGA driver from the console layer before unloading the driver. The VGA driver cannot be unloaded if it is still bound to the console layer. (See -Documentation/console/console.rst for more information). +Documentation/driver-api/console.rst for more information). This is more complicated in the case of the framebuffer console (fbcon), because fbcon is an intermediate layer between the console and the drivers:: @@ -204,7 +204,7 @@ fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from fbcon. So, how do we unbind fbcon from the console? Part of the answer is in -Documentation/console/console.rst. To summarize: +Documentation/driver-api/console.rst. To summarize: Echo a value to the bind file that represents the framebuffer console driver. So assuming vtcon1 represents fbcon, then:: diff --git a/Documentation/isa.txt b/Documentation/isa.txt deleted file mode 100644 index def4a7b690b5..000000000000 --- a/Documentation/isa.txt +++ /dev/null @@ -1,122 +0,0 @@ -=========== -ISA Drivers -=========== - -The following text is adapted from the commit message of the initial -commit of the ISA bus driver authored by Rene Herman. - -During the recent "isa drivers using platform devices" discussion it was -pointed out that (ALSA) ISA drivers ran into the problem of not having -the option to fail driver load (device registration rather) upon not -finding their hardware due to a probe() error not being passed up -through the driver model. In the course of that, I suggested a separate -ISA bus might be best; Russell King agreed and suggested this bus could -use the .match() method for the actual device discovery. - -The attached does this. For this old non (generically) discoverable ISA -hardware only the driver itself can do discovery so as a difference with -the platform_bus, this isa_bus also distributes match() up to the -driver. - -As another difference: these devices only exist in the driver model due -to the driver creating them because it might want to drive them, meaning -that all device creation has been made internal as well. - -The usage model this provides is nice, and has been acked from the ALSA -side by Takashi Iwai and Jaroslav Kysela. The ALSA driver module_init's -now (for oldisa-only drivers) become:: - - static int __init alsa_card_foo_init(void) - { - return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS); - } - - static void __exit alsa_card_foo_exit(void) - { - isa_unregister_driver(&snd_foo_isa_driver); - } - -Quite like the other bus models therefore. This removes a lot of -duplicated init code from the ALSA ISA drivers. - -The passed in isa_driver struct is the regular driver struct embedding a -struct device_driver, the normal probe/remove/shutdown/suspend/resume -callbacks, and as indicated that .match callback. - -The "SNDRV_CARDS" you see being passed in is a "unsigned int ndev" -parameter, indicating how many devices to create and call our methods -with. - -The platform_driver callbacks are called with a platform_device param; -the isa_driver callbacks are being called with a ``struct device *dev, -unsigned int id`` pair directly -- with the device creation completely -internal to the bus it's much cleaner to not leak isa_dev's by passing -them in at all. The id is the only thing we ever want other then the -struct device anyways, and it makes for nicer code in the callbacks as -well. - -With this additional .match() callback ISA drivers have all options. If -ALSA would want to keep the old non-load behaviour, it could stick all -of the old .probe in .match, which would only keep them registered after -everything was found to be present and accounted for. If it wanted the -behaviour of always loading as it inadvertently did for a bit after the -changeover to platform devices, it could just not provide a .match() and -do everything in .probe() as before. - -If it, as Takashi Iwai already suggested earlier as a way of following -the model from saner buses more closely, wants to load when a later bind -could conceivably succeed, it could use .match() for the prerequisites -(such as checking the user wants the card enabled and that port/irq/dma -values have been passed in) and .probe() for everything else. This is -the nicest model. - -To the code... - -This exports only two functions; isa_{,un}register_driver(). - -isa_register_driver() register's the struct device_driver, and then -loops over the passed in ndev creating devices and registering them. -This causes the bus match method to be called for them, which is:: - - int isa_bus_match(struct device *dev, struct device_driver *driver) - { - struct isa_driver *isa_driver = to_isa_driver(driver); - - if (dev->platform_data == isa_driver) { - if (!isa_driver->match || - isa_driver->match(dev, to_isa_dev(dev)->id)) - return 1; - dev->platform_data = NULL; - } - return 0; - } - -The first thing this does is check if this device is in fact one of this -driver's devices by seeing if the device's platform_data pointer is set -to this driver. Platform devices compare strings, but we don't need to -do that with everything being internal, so isa_register_driver() abuses -dev->platform_data as a isa_driver pointer which we can then check here. -I believe platform_data is available for this, but if rather not, moving -the isa_driver pointer to the private struct isa_dev is ofcourse fine as -well. - -Then, if the the driver did not provide a .match, it matches. If it did, -the driver match() method is called to determine a match. - -If it did **not** match, dev->platform_data is reset to indicate this to -isa_register_driver which can then unregister the device again. - -If during all this, there's any error, or no devices matched at all -everything is backed out again and the error, or -ENODEV, is returned. - -isa_unregister_driver() just unregisters the matched devices and the -driver itself. - -module_isa_driver is a helper macro for ISA drivers which do not do -anything special in module init/exit. This eliminates a lot of -boilerplate code. Each module may only use this macro once, and calling -it replaces module_init and module_exit. - -max_num_isa_dev is a macro to determine the maximum possible number of -ISA devices which may be registered in the I/O port address space given -the address extent of the ISA devices. diff --git a/Documentation/isapnp.txt b/Documentation/isapnp.txt deleted file mode 100644 index 8d0840ac847b..000000000000 --- a/Documentation/isapnp.txt +++ /dev/null @@ -1,15 +0,0 @@ -========================================================== -ISA Plug & Play support by Jaroslav Kysela -========================================================== - -Interface /proc/isapnp -====================== - -The interface has been removed. See pnp.txt for more details. - -Interface /proc/bus/isapnp -========================== - -This directory allows access to ISA PnP cards and logical devices. -The regular files contain the contents of ISA PnP registers for -a logical device. diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt deleted file mode 100644 index 1040ed1cec81..000000000000 --- a/Documentation/lightnvm/pblk.txt +++ /dev/null @@ -1,21 +0,0 @@ -pblk: Physical Block Device Target -================================== - -pblk implements a fully associative, host-based FTL that exposes a traditional -block I/O interface. Its primary responsibilities are: - - - Map logical addresses onto physical addresses (4KB granularity) in a - logical-to-physical (L2P) table. - - Maintain the integrity and consistency of the L2P table as well as its - recovery from normal tear down and power outage. - - Deal with controller- and media-specific constrains. - - Handle I/O errors. - - Implement garbage collection. - - Maintain consistency across the I/O stack during synchronization points. - -For more information please refer to: - - http://lightnvm.io - -which maintains updated FAQs, manual pages, technical documentation, tools, -contacts, etc. diff --git a/Documentation/men-chameleon-bus.txt b/Documentation/men-chameleon-bus.txt deleted file mode 100644 index 1b1f048aa748..000000000000 --- a/Documentation/men-chameleon-bus.txt +++ /dev/null @@ -1,175 +0,0 @@ -================= -MEN Chameleon Bus -================= - -.. Table of Contents - ================= - 1 Introduction - 1.1 Scope of this Document - 1.2 Limitations of the current implementation - 2 Architecture - 2.1 MEN Chameleon Bus - 2.2 Carrier Devices - 2.3 Parser - 3 Resource handling - 3.1 Memory Resources - 3.2 IRQs - 4 Writing an MCB driver - 4.1 The driver structure - 4.2 Probing and attaching - 4.3 Initializing the driver - - -Introduction -============ - -This document describes the architecture and implementation of the MEN -Chameleon Bus (called MCB throughout this document). - -Scope of this Document ----------------------- - -This document is intended to be a short overview of the current -implementation and does by no means describe the complete possibilities of MCB -based devices. - -Limitations of the current implementation ------------------------------------------ - -The current implementation is limited to PCI and PCIe based carrier devices -that only use a single memory resource and share the PCI legacy IRQ. Not -implemented are: - -- Multi-resource MCB devices like the VME Controller or M-Module carrier. -- MCB devices that need another MCB device, like SRAM for a DMA Controller's - buffer descriptors or a video controller's video memory. -- A per-carrier IRQ domain for carrier devices that have one (or more) IRQs - per MCB device like PCIe based carriers with MSI or MSI-X support. - -Architecture -============ - -MCB is divided into 3 functional blocks: - -- The MEN Chameleon Bus itself, -- drivers for MCB Carrier Devices and -- the parser for the Chameleon table. - -MEN Chameleon Bus ------------------ - -The MEN Chameleon Bus is an artificial bus system that attaches to a so -called Chameleon FPGA device found on some hardware produced my MEN Mikro -Elektronik GmbH. These devices are multi-function devices implemented in a -single FPGA and usually attached via some sort of PCI or PCIe link. Each -FPGA contains a header section describing the content of the FPGA. The -header lists the device id, PCI BAR, offset from the beginning of the PCI -BAR, size in the FPGA, interrupt number and some other properties currently -not handled by the MCB implementation. - -Carrier Devices ---------------- - -A carrier device is just an abstraction for the real world physical bus the -Chameleon FPGA is attached to. Some IP Core drivers may need to interact with -properties of the carrier device (like querying the IRQ number of a PCI -device). To provide abstraction from the real hardware bus, an MCB carrier -device provides callback methods to translate the driver's MCB function calls -to hardware related function calls. For example a carrier device may -implement the get_irq() method which can be translated into a hardware bus -query for the IRQ number the device should use. - -Parser ------- - -The parser reads the first 512 bytes of a Chameleon device and parses the -Chameleon table. Currently the parser only supports the Chameleon v2 variant -of the Chameleon table but can easily be adopted to support an older or -possible future variant. While parsing the table's entries new MCB devices -are allocated and their resources are assigned according to the resource -assignment in the Chameleon table. After resource assignment is finished, the -MCB devices are registered at the MCB and thus at the driver core of the -Linux kernel. - -Resource handling -================= - -The current implementation assigns exactly one memory and one IRQ resource -per MCB device. But this is likely going to change in the future. - -Memory Resources ----------------- - -Each MCB device has exactly one memory resource, which can be requested from -the MCB bus. This memory resource is the physical address of the MCB device -inside the carrier and is intended to be passed to ioremap() and friends. It -is already requested from the kernel by calling request_mem_region(). - -IRQs ----- - -Each MCB device has exactly one IRQ resource, which can be requested from the -MCB bus. If a carrier device driver implements the ->get_irq() callback -method, the IRQ number assigned by the carrier device will be returned, -otherwise the IRQ number inside the Chameleon table will be returned. This -number is suitable to be passed to request_irq(). - -Writing an MCB driver -===================== - -The driver structure --------------------- - -Each MCB driver has a structure to identify the device driver as well as -device ids which identify the IP Core inside the FPGA. The driver structure -also contains callback methods which get executed on driver probe and -removal from the system:: - - static const struct mcb_device_id foo_ids[] = { - { .device = 0x123 }, - { } - }; - MODULE_DEVICE_TABLE(mcb, foo_ids); - - static struct mcb_driver foo_driver = { - driver = { - .name = "foo-bar", - .owner = THIS_MODULE, - }, - .probe = foo_probe, - .remove = foo_remove, - .id_table = foo_ids, - }; - -Probing and attaching ---------------------- - -When a driver is loaded and the MCB devices it services are found, the MCB -core will call the driver's probe callback method. When the driver is removed -from the system, the MCB core will call the driver's remove callback method:: - - static init foo_probe(struct mcb_device *mdev, const struct mcb_device_id *id); - static void foo_remove(struct mcb_device *mdev); - -Initializing the driver ------------------------ - -When the kernel is booted or your foo driver module is inserted, you have to -perform driver initialization. Usually it is enough to register your driver -module at the MCB core:: - - static int __init foo_init(void) - { - return mcb_register_driver(&foo_driver); - } - module_init(foo_init); - - static void __exit foo_exit(void) - { - mcb_unregister_driver(&foo_driver); - } - module_exit(foo_exit); - -The module_mcb_driver() macro can be used to reduce the above code:: - - module_mcb_driver(foo_driver); diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt deleted file mode 100644 index 074a423c853c..000000000000 --- a/Documentation/ntb.txt +++ /dev/null @@ -1,236 +0,0 @@ -=========== -NTB Drivers -=========== - -NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects -the separate memory systems of two or more computers to the same PCI-Express -fabric. Existing NTB hardware supports a common feature set: doorbell -registers and memory translation windows, as well as non common features like -scratchpad and message registers. Scratchpad registers are read-and-writable -registers that are accessible from either side of the device, so that peers can -exchange a small amount of information at a fixed address. Message registers can -be utilized for the same purpose. Additionally they are provided with with -special status bits to make sure the information isn't rewritten by another -peer. Doorbell registers provide a way for peers to send interrupt events. -Memory windows allow translated read and write access to the peer memory. - -NTB Core Driver (ntb) -===================== - -The NTB core driver defines an api wrapping the common feature set, and allows -clients interested in NTB features to discover NTB the devices supported by -hardware drivers. The term "client" is used here to mean an upper layer -component making use of the NTB api. The term "driver," or "hardware driver," -is used here to mean a driver for a specific vendor and model of NTB hardware. - -NTB Client Drivers -================== - -NTB client drivers should register with the NTB core driver. After -registering, the client probe and remove functions will be called appropriately -as ntb hardware, or hardware drivers, are inserted and removed. The -registration uses the Linux Device framework, so it should feel familiar to -anyone who has written a pci driver. - -NTB Typical client driver implementation ----------------------------------------- - -Primary purpose of NTB is to share some peace of memory between at least two -systems. So the NTB device features like Scratchpad/Message registers are -mainly used to perform the proper memory window initialization. Typically -there are two types of memory window interfaces supported by the NTB API: -inbound translation configured on the local ntb port and outbound translation -configured by the peer, on the peer ntb port. The first type is -depicted on the next figure:: - - Inbound translation: - - Memory: Local NTB Port: Peer NTB Port: Peer MMIO: - ____________ - | dma-mapped |-ntb_mw_set_trans(addr) | - | memory | _v____________ | ______________ - | (addr) |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO - |------------| |--------------| | |--------------| - -So typical scenario of the first type memory window initialization looks: -1) allocate a memory region, 2) put translated address to NTB config, -3) somehow notify a peer device of performed initialization, 4) peer device -maps corresponding outbound memory window so to have access to the shared -memory region. - -The second type of interface, that implies the shared windows being -initialized by a peer device, is depicted on the figure:: - - Outbound translation: - - Memory: Local NTB Port: Peer NTB Port: Peer MMIO: - ____________ ______________ - | dma-mapped | | | MW base addr |<== memory-mapped IO - | memory | | |--------------| - | (addr) |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr) - |------------| | |--------------| - -Typical scenario of the second type interface initialization would be: -1) allocate a memory region, 2) somehow deliver a translated address to a peer -device, 3) peer puts the translated address to NTB config, 4) peer device maps -outbound memory window so to have access to the shared memory region. - -As one can see the described scenarios can be combined in one portable -algorithm. - - Local device: - 1) Allocate memory for a shared window - 2) Initialize memory window by translated address of the allocated region - (it may fail if local memory window initialization is unsupported) - 3) Send the translated address and memory window index to a peer device - - Peer device: - 1) Initialize memory window with retrieved address of the allocated - by another device memory region (it may fail if peer memory window - initialization is unsupported) - 2) Map outbound memory window - -In accordance with this scenario, the NTB Memory Window API can be used as -follows: - - Local device: - 1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can - be allocated for memory windows between local device and peer device - of port with specified index. - 2) ntb_get_align(pidx, midx) - retrieve parameters restricting the - shared memory region alignment and size. Then memory can be properly - allocated. - 3) Allocate physically contiguous memory region in compliance with - restrictions retrieved in 2). - 4) ntb_mw_set_trans(pidx, midx) - try to set translation address of - the memory window with specified index for the defined peer device - (it may fail if local translated address setting is not supported) - 5) Send translated base address (usually together with memory window - number) to the peer device using, for instance, scratchpad or message - registers. - - Peer device: - 1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other - device (related to pidx) translated address for specified memory - window. It may fail if retrieved address, for instance, exceeds - maximum possible address or isn't properly aligned. - 2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory - window so to have an access to the shared memory. - -Also it is worth to note, that method ntb_mw_count(pidx) should return the -same value as ntb_peer_mw_count() on the peer with port index - pidx. - -NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev) ------------------------------------------------------------------- - -The primary client for NTB is the Transport client, used in tandem with NTB -Netdev. These drivers function together to create a logical link to the peer, -across the ntb, to exchange packets of network data. The Transport client -establishes a logical link to the peer, and creates queue pairs to exchange -messages and data. The NTB Netdev then creates an ethernet device using a -Transport queue pair. Network data is copied between socket buffers and the -Transport queue pair buffer. The Transport client may be used for other things -besides Netdev, however no other applications have yet been written. - -NTB Ping Pong Test Client (ntb\_pingpong) ------------------------------------------ - -The Ping Pong test client serves as a demonstration to exercise the doorbell -and scratchpad registers of NTB hardware, and as an example simple NTB client. -Ping Pong enables the link when started, waits for the NTB link to come up, and -then proceeds to read and write the doorbell scratchpad registers of the NTB. -The peers interrupt each other using a bit mask of doorbell bits, which is -shifted by one in each round, to test the behavior of multiple doorbell bits -and interrupt vectors. The Ping Pong driver also reads the first local -scratchpad, and writes the value plus one to the first peer scratchpad, each -round before writing the peer doorbell register. - -Module Parameters: - -* unsafe - Some hardware has known issues with scratchpad and doorbell - registers. By default, Ping Pong will not attempt to exercise such - hardware. You may override this behavior at your own risk by setting - unsafe=1. -* delay\_ms - Specify the delay between receiving a doorbell - interrupt event and setting the peer doorbell register for the next - round. -* init\_db - Specify the doorbell bits to start new series of rounds. A new - series begins once all the doorbell bits have been shifted out of - range. -* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and - then to observe debugging output on the console. - -NTB Tool Test Client (ntb\_tool) --------------------------------- - -The Tool test client serves for debugging, primarily, ntb hardware and drivers. -The Tool provides access through debugfs for reading, setting, and clearing the -NTB doorbell, and reading and writing scratchpads. - -The Tool does not currently have any module parameters. - -Debugfs Files: - -* *debugfs*/ntb\_tool/*hw*/ - A directory in debugfs will be created for each - NTB device probed by the tool. This directory is shortened to *hw* - below. -* *hw*/db - This file is used to read, set, and clear the local doorbell. Not - all operations may be supported by all hardware. To read the doorbell, - read the file. To set the doorbell, write `s` followed by the bits to - set (eg: `echo 's 0x0101' > db`). To clear the doorbell, write `c` - followed by the bits to clear. -* *hw*/mask - This file is used to read, set, and clear the local doorbell mask. - See *db* for details. -* *hw*/peer\_db - This file is used to read, set, and clear the peer doorbell. - See *db* for details. -* *hw*/peer\_mask - This file is used to read, set, and clear the peer doorbell - mask. See *db* for details. -* *hw*/spad - This file is used to read and write local scratchpads. To read - the values of all scratchpads, read the file. To write values, write a - series of pairs of scratchpad number and value - (eg: `echo '4 0x123 7 0xabc' > spad` - # to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively). -* *hw*/peer\_spad - This file is used to read and write peer scratchpads. See - *spad* for details. - -NTB Hardware Drivers -==================== - -NTB hardware drivers should register devices with the NTB core driver. After -registering, clients probe and remove functions will be called. - -NTB Intel Hardware Driver (ntb\_hw\_intel) ------------------------------------------- - -The Intel hardware driver supports NTB on Xeon and Atom CPUs. - -Module Parameters: - -* b2b\_mw\_idx - If the peer ntb is to be accessed via a memory window, then use - this memory window to access the peer ntb. A value of zero or positive - starts from the first mw idx, and a negative value starts from the last - mw idx. Both sides MUST set the same value here! The default value is - `-1`. -* b2b\_mw\_share - If the peer ntb is to be accessed via a memory window, and if - the memory window is large enough, still allow the client to use the - second half of the memory window for address translation to the peer. -* xeon\_b2b\_usd\_bar2\_addr64 - If using B2B topology on Xeon hardware, use - this 64 bit address on the bus between the NTB devices for the window - at BAR2, on the upstream side of the link. -* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*. -* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*. diff --git a/Documentation/nvmem/nvmem.rst b/Documentation/nvmem/nvmem.rst deleted file mode 100644 index 3866b6e066d5..000000000000 --- a/Documentation/nvmem/nvmem.rst +++ /dev/null @@ -1,189 +0,0 @@ -:orphan: - -=============== -NVMEM Subsystem -=============== - - Srinivas Kandagatla - -This document explains the NVMEM Framework along with the APIs provided, -and how to use it. - -1. Introduction -=============== -*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to -retrieve configuration of SOC or Device specific data from non volatile -memories like eeprom, efuses and so on. - -Before this framework existed, NVMEM drivers like eeprom were stored in -drivers/misc, where they all had to duplicate pretty much the same code to -register a sysfs file, allow in-kernel users to access the content of the -devices they were driving, etc. - -This was also a problem as far as other in-kernel users were involved, since -the solutions used were pretty much different from one driver to another, there -was a rather big abstraction leak. - -This framework aims at solve these problems. It also introduces DT -representation for consumer devices to go get the data they require (MAC -Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This -framework is based on regmap, so that most of the abstraction available in -regmap can be reused, across multiple types of buses. - -NVMEM Providers -+++++++++++++++ - -NVMEM provider refers to an entity that implements methods to initialize, read -and write the non-volatile memory. - -2. Registering/Unregistering the NVMEM provider -=============================================== - -A NVMEM provider can register with NVMEM core by supplying relevant -nvmem configuration to nvmem_register(), on success core would return a valid -nvmem_device pointer. - -nvmem_unregister(nvmem) is used to unregister a previously registered provider. - -For example, a simple qfprom case:: - - static struct nvmem_config econfig = { - .name = "qfprom", - .owner = THIS_MODULE, - }; - - static int qfprom_probe(struct platform_device *pdev) - { - ... - econfig.dev = &pdev->dev; - nvmem = nvmem_register(&econfig); - ... - } - -It is mandatory that the NVMEM provider has a regmap associated with its -struct device. Failure to do would return error code from nvmem_register(). - -Users of board files can define and register nvmem cells using the -nvmem_cell_table struct:: - - static struct nvmem_cell_info foo_nvmem_cells[] = { - { - .name = "macaddr", - .offset = 0x7f00, - .bytes = ETH_ALEN, - } - }; - - static struct nvmem_cell_table foo_nvmem_cell_table = { - .nvmem_name = "i2c-eeprom", - .cells = foo_nvmem_cells, - .ncells = ARRAY_SIZE(foo_nvmem_cells), - }; - - nvmem_add_cell_table(&foo_nvmem_cell_table); - -Additionally it is possible to create nvmem cell lookup entries and register -them with the nvmem framework from machine code as shown in the example below:: - - static struct nvmem_cell_lookup foo_nvmem_lookup = { - .nvmem_name = "i2c-eeprom", - .cell_name = "macaddr", - .dev_id = "foo_mac.0", - .con_id = "mac-address", - }; - - nvmem_add_cell_lookups(&foo_nvmem_lookup, 1); - -NVMEM Consumers -+++++++++++++++ - -NVMEM consumers are the entities which make use of the NVMEM provider to -read from and to NVMEM. - -3. NVMEM cell based consumer APIs -================================= - -NVMEM cells are the data entries/fields in the NVMEM. -The NVMEM framework provides 3 APIs to read/write NVMEM cells:: - - struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name); - struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name); - - void nvmem_cell_put(struct nvmem_cell *cell); - void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell); - - void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len); - int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len); - -`*nvmem_cell_get()` apis will get a reference to nvmem cell for a given id, -and nvmem_cell_read/write() can then read or write to the cell. -Once the usage of the cell is finished the consumer should call -`*nvmem_cell_put()` to free all the allocation memory for the cell. - -4. Direct NVMEM device based consumer APIs -========================================== - -In some instances it is necessary to directly read/write the NVMEM. -To facilitate such consumers NVMEM framework provides below apis:: - - struct nvmem_device *nvmem_device_get(struct device *dev, const char *name); - struct nvmem_device *devm_nvmem_device_get(struct device *dev, - const char *name); - void nvmem_device_put(struct nvmem_device *nvmem); - int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset, - size_t bytes, void *buf); - int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset, - size_t bytes, void *buf); - int nvmem_device_cell_read(struct nvmem_device *nvmem, - struct nvmem_cell_info *info, void *buf); - int nvmem_device_cell_write(struct nvmem_device *nvmem, - struct nvmem_cell_info *info, void *buf); - -Before the consumers can read/write NVMEM directly, it should get hold -of nvmem_controller from one of the `*nvmem_device_get()` api. - -The difference between these apis and cell based apis is that these apis always -take nvmem_device as parameter. - -5. Releasing a reference to the NVMEM -===================================== - -When a consumer no longer needs the NVMEM, it has to release the reference -to the NVMEM it has obtained using the APIs mentioned in the above section. -The NVMEM framework provides 2 APIs to release a reference to the NVMEM:: - - void nvmem_cell_put(struct nvmem_cell *cell); - void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell); - void nvmem_device_put(struct nvmem_device *nvmem); - void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem); - -Both these APIs are used to release a reference to the NVMEM and -devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated -with this NVMEM. - -Userspace -+++++++++ - -6. Userspace binary interface -============================== - -Userspace can read/write the raw NVMEM file located at:: - - /sys/bus/nvmem/devices/*/nvmem - -ex:: - - hexdump /sys/bus/nvmem/devices/qfprom0/nvmem - - 0000000 0000 0000 0000 0000 0000 0000 0000 0000 - * - 00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00 - 0000000 0000 0000 0000 0000 0000 0000 0000 0000 - ... - * - 0001000 - -7. DeviceTree Binding -===================== - -See Documentation/devicetree/bindings/nvmem/nvmem.txt diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt deleted file mode 100644 index 0633d70ffda7..000000000000 --- a/Documentation/parport-lowlevel.txt +++ /dev/null @@ -1,1832 +0,0 @@ -=============================== -PARPORT interface documentation -=============================== - -:Time-stamp: <2000-02-24 13:30:20 twaugh> - -Described here are the following functions: - -Global functions:: - parport_register_driver - parport_unregister_driver - parport_enumerate - parport_register_device - parport_unregister_device - parport_claim - parport_claim_or_block - parport_release - parport_yield - parport_yield_blocking - parport_wait_peripheral - parport_poll_peripheral - parport_wait_event - parport_negotiate - parport_read - parport_write - parport_open - parport_close - parport_device_id - parport_device_coords - parport_find_class - parport_find_device - parport_set_timeout - -Port functions (can be overridden by low-level drivers): - - SPP:: - port->ops->read_data - port->ops->write_data - port->ops->read_status - port->ops->read_control - port->ops->write_control - port->ops->frob_control - port->ops->enable_irq - port->ops->disable_irq - port->ops->data_forward - port->ops->data_reverse - - EPP:: - port->ops->epp_write_data - port->ops->epp_read_data - port->ops->epp_write_addr - port->ops->epp_read_addr - - ECP:: - port->ops->ecp_write_data - port->ops->ecp_read_data - port->ops->ecp_write_addr - - Other:: - port->ops->nibble_read_data - port->ops->byte_read_data - port->ops->compat_write_data - -The parport subsystem comprises ``parport`` (the core port-sharing -code), and a variety of low-level drivers that actually do the port -accesses. Each low-level driver handles a particular style of port -(PC, Amiga, and so on). - -The parport interface to the device driver author can be broken down -into global functions and port functions. - -The global functions are mostly for communicating between the device -driver and the parport subsystem: acquiring a list of available ports, -claiming a port for exclusive use, and so on. They also include -``generic`` functions for doing standard things that will work on any -IEEE 1284-capable architecture. - -The port functions are provided by the low-level drivers, although the -core parport module provides generic ``defaults`` for some routines. -The port functions can be split into three groups: SPP, EPP, and ECP. - -SPP (Standard Parallel Port) functions modify so-called ``SPP`` -registers: data, status, and control. The hardware may not actually -have registers exactly like that, but the PC does and this interface is -modelled after common PC implementations. Other low-level drivers may -be able to emulate most of the functionality. - -EPP (Enhanced Parallel Port) functions are provided for reading and -writing in IEEE 1284 EPP mode, and ECP (Extended Capabilities Port) -functions are used for IEEE 1284 ECP mode. (What about BECP? Does -anyone care?) - -Hardware assistance for EPP and/or ECP transfers may or may not be -available, and if it is available it may or may not be used. If -hardware is not used, the transfer will be software-driven. In order -to cope with peripherals that only tenuously support IEEE 1284, a -low-level driver specific function is provided, for altering 'fudge -factors'. - -Global functions -================ - -parport_register_driver - register a device driver with parport ---------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_driver { - const char *name; - void (*attach) (struct parport *); - void (*detach) (struct parport *); - struct parport_driver *next; - }; - int parport_register_driver (struct parport_driver *driver); - -DESCRIPTION -^^^^^^^^^^^ - -In order to be notified about parallel ports when they are detected, -parport_register_driver should be called. Your driver will -immediately be notified of all ports that have already been detected, -and of each new port as low-level drivers are loaded. - -A ``struct parport_driver`` contains the textual name of your driver, -a pointer to a function to handle new ports, and a pointer to a -function to handle ports going away due to a low-level driver -unloading. Ports will only be detached if they are not being used -(i.e. there are no devices registered on them). - -The visible parts of the ``struct parport *`` argument given to -attach/detach are:: - - struct parport - { - struct parport *next; /* next parport in list */ - const char *name; /* port's name */ - unsigned int modes; /* bitfield of hardware modes */ - struct parport_device_info probe_info; - /* IEEE1284 info */ - int number; /* parport index */ - struct parport_operations *ops; - ... - }; - -There are other members of the structure, but they should not be -touched. - -The ``modes`` member summarises the capabilities of the underlying -hardware. It consists of flags which may be bitwise-ored together: - - ============================= =============================================== - PARPORT_MODE_PCSPP IBM PC registers are available, - i.e. functions that act on data, - control and status registers are - probably writing directly to the - hardware. - PARPORT_MODE_TRISTATE The data drivers may be turned off. - This allows the data lines to be used - for reverse (peripheral to host) - transfers. - PARPORT_MODE_COMPAT The hardware can assist with - compatibility-mode (printer) - transfers, i.e. compat_write_block. - PARPORT_MODE_EPP The hardware can assist with EPP - transfers. - PARPORT_MODE_ECP The hardware can assist with ECP - transfers. - PARPORT_MODE_DMA The hardware can use DMA, so you might - want to pass ISA DMA-able memory - (i.e. memory allocated using the - GFP_DMA flag with kmalloc) to the - low-level driver in order to take - advantage of it. - ============================= =============================================== - -There may be other flags in ``modes`` as well. - -The contents of ``modes`` is advisory only. For example, if the -hardware is capable of DMA, and PARPORT_MODE_DMA is in ``modes``, it -doesn't necessarily mean that DMA will always be used when possible. -Similarly, hardware that is capable of assisting ECP transfers won't -necessarily be used. - -RETURN VALUE -^^^^^^^^^^^^ - -Zero on success, otherwise an error code. - -ERRORS -^^^^^^ - -None. (Can it fail? Why return int?) - -EXAMPLE -^^^^^^^ - -:: - - static void lp_attach (struct parport *port) - { - ... - private = kmalloc (...); - dev[count++] = parport_register_device (...); - ... - } - - static void lp_detach (struct parport *port) - { - ... - } - - static struct parport_driver lp_driver = { - "lp", - lp_attach, - lp_detach, - NULL /* always put NULL here */ - }; - - int lp_init (void) - { - ... - if (parport_register_driver (&lp_driver)) { - /* Failed; nothing we can do. */ - return -EIO; - } - ... - } - - -SEE ALSO -^^^^^^^^ - -parport_unregister_driver, parport_register_device, parport_enumerate - - - -parport_unregister_driver - tell parport to forget about this driver --------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_driver { - const char *name; - void (*attach) (struct parport *); - void (*detach) (struct parport *); - struct parport_driver *next; - }; - void parport_unregister_driver (struct parport_driver *driver); - -DESCRIPTION -^^^^^^^^^^^ - -This tells parport not to notify the device driver of new ports or of -ports going away. Registered devices belonging to that driver are NOT -unregistered: parport_unregister_device must be used for each one. - -EXAMPLE -^^^^^^^ - -:: - - void cleanup_module (void) - { - ... - /* Stop notifications. */ - parport_unregister_driver (&lp_driver); - - /* Unregister devices. */ - for (i = 0; i < NUM_DEVS; i++) - parport_unregister_device (dev[i]); - ... - } - -SEE ALSO -^^^^^^^^ - -parport_register_driver, parport_enumerate - - - -parport_enumerate - retrieve a list of parallel ports (DEPRECATED) ------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport *parport_enumerate (void); - -DESCRIPTION -^^^^^^^^^^^ - -Retrieve the first of a list of valid parallel ports for this machine. -Successive parallel ports can be found using the ``struct parport -*next`` element of the ``struct parport *`` that is returned. If ``next`` -is NULL, there are no more parallel ports in the list. The number of -ports in the list will not exceed PARPORT_MAX. - -RETURN VALUE -^^^^^^^^^^^^ - -A ``struct parport *`` describing a valid parallel port for the machine, -or NULL if there are none. - -ERRORS -^^^^^^ - -This function can return NULL to indicate that there are no parallel -ports to use. - -EXAMPLE -^^^^^^^ - -:: - - int detect_device (void) - { - struct parport *port; - - for (port = parport_enumerate (); - port != NULL; - port = port->next) { - /* Try to detect a device on the port... */ - ... - } - } - - ... - } - -NOTES -^^^^^ - -parport_enumerate is deprecated; parport_register_driver should be -used instead. - -SEE ALSO -^^^^^^^^ - -parport_register_driver, parport_unregister_driver - - - -parport_register_device - register to use a port ------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - typedef int (*preempt_func) (void *handle); - typedef void (*wakeup_func) (void *handle); - typedef int (*irq_func) (int irq, void *handle, struct pt_regs *); - - struct pardevice *parport_register_device(struct parport *port, - const char *name, - preempt_func preempt, - wakeup_func wakeup, - irq_func irq, - int flags, - void *handle); - -DESCRIPTION -^^^^^^^^^^^ - -Use this function to register your device driver on a parallel port -(``port``). Once you have done that, you will be able to use -parport_claim and parport_release in order to use the port. - -The (``name``) argument is the name of the device that appears in /proc -filesystem. The string must be valid for the whole lifetime of the -device (until parport_unregister_device is called). - -This function will register three callbacks into your driver: -``preempt``, ``wakeup`` and ``irq``. Each of these may be NULL in order to -indicate that you do not want a callback. - -When the ``preempt`` function is called, it is because another driver -wishes to use the parallel port. The ``preempt`` function should return -non-zero if the parallel port cannot be released yet -- if zero is -returned, the port is lost to another driver and the port must be -re-claimed before use. - -The ``wakeup`` function is called once another driver has released the -port and no other driver has yet claimed it. You can claim the -parallel port from within the ``wakeup`` function (in which case the -claim is guaranteed to succeed), or choose not to if you don't need it -now. - -If an interrupt occurs on the parallel port your driver has claimed, -the ``irq`` function will be called. (Write something about shared -interrupts here.) - -The ``handle`` is a pointer to driver-specific data, and is passed to -the callback functions. - -``flags`` may be a bitwise combination of the following flags: - - ===================== ================================================= - Flag Meaning - ===================== ================================================= - PARPORT_DEV_EXCL The device cannot share the parallel port at all. - Use this only when absolutely necessary. - ===================== ================================================= - -The typedefs are not actually defined -- they are only shown in order -to make the function prototype more readable. - -The visible parts of the returned ``struct pardevice`` are:: - - struct pardevice { - struct parport *port; /* Associated port */ - void *private; /* Device driver's 'handle' */ - ... - }; - -RETURN VALUE -^^^^^^^^^^^^ - -A ``struct pardevice *``: a handle to the registered parallel port -device that can be used for parport_claim, parport_release, etc. - -ERRORS -^^^^^^ - -A return value of NULL indicates that there was a problem registering -a device on that port. - -EXAMPLE -^^^^^^^ - -:: - - static int preempt (void *handle) - { - if (busy_right_now) - return 1; - - must_reclaim_port = 1; - return 0; - } - - static void wakeup (void *handle) - { - struct toaster *private = handle; - struct pardevice *dev = private->dev; - if (!dev) return; /* avoid races */ - - if (want_port) - parport_claim (dev); - } - - static int toaster_detect (struct toaster *private, struct parport *port) - { - private->dev = parport_register_device (port, "toaster", preempt, - wakeup, NULL, 0, - private); - if (!private->dev) - /* Couldn't register with parport. */ - return -EIO; - - must_reclaim_port = 0; - busy_right_now = 1; - parport_claim_or_block (private->dev); - ... - /* Don't need the port while the toaster warms up. */ - busy_right_now = 0; - ... - busy_right_now = 1; - if (must_reclaim_port) { - parport_claim_or_block (private->dev); - must_reclaim_port = 0; - } - ... - } - -SEE ALSO -^^^^^^^^ - -parport_unregister_device, parport_claim - - - -parport_unregister_device - finish using a port ------------------------------------------------ - -SYNPOPSIS - -:: - - #include - - void parport_unregister_device (struct pardevice *dev); - -DESCRIPTION -^^^^^^^^^^^ - -This function is the opposite of parport_register_device. After using -parport_unregister_device, ``dev`` is no longer a valid device handle. - -You should not unregister a device that is currently claimed, although -if you do it will be released automatically. - -EXAMPLE -^^^^^^^ - -:: - - ... - kfree (dev->private); /* before we lose the pointer */ - parport_unregister_device (dev); - ... - -SEE ALSO -^^^^^^^^ - - -parport_unregister_driver - -parport_claim, parport_claim_or_block - claim the parallel port for a device ----------------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_claim (struct pardevice *dev); - int parport_claim_or_block (struct pardevice *dev); - -DESCRIPTION -^^^^^^^^^^^ - -These functions attempt to gain control of the parallel port on which -``dev`` is registered. ``parport_claim`` does not block, but -``parport_claim_or_block`` may do. (Put something here about blocking -interruptibly or non-interruptibly.) - -You should not try to claim a port that you have already claimed. - -RETURN VALUE -^^^^^^^^^^^^ - -A return value of zero indicates that the port was successfully -claimed, and the caller now has possession of the parallel port. - -If ``parport_claim_or_block`` blocks before returning successfully, the -return value is positive. - -ERRORS -^^^^^^ - -========== ========================================================== - -EAGAIN The port is unavailable at the moment, but another attempt - to claim it may succeed. -========== ========================================================== - -SEE ALSO -^^^^^^^^ - - -parport_release - -parport_release - release the parallel port -------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - void parport_release (struct pardevice *dev); - -DESCRIPTION -^^^^^^^^^^^ - -Once a parallel port device has been claimed, it can be released using -``parport_release``. It cannot fail, but you should not release a -device that you do not have possession of. - -EXAMPLE -^^^^^^^ - -:: - - static size_t write (struct pardevice *dev, const void *buf, - size_t len) - { - ... - written = dev->port->ops->write_ecp_data (dev->port, buf, - len); - parport_release (dev); - ... - } - - -SEE ALSO -^^^^^^^^ - -change_mode, parport_claim, parport_claim_or_block, parport_yield - - - -parport_yield, parport_yield_blocking - temporarily release a parallel port ---------------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_yield (struct pardevice *dev) - int parport_yield_blocking (struct pardevice *dev); - -DESCRIPTION -^^^^^^^^^^^ - -When a driver has control of a parallel port, it may allow another -driver to temporarily ``borrow`` it. ``parport_yield`` does not block; -``parport_yield_blocking`` may do. - -RETURN VALUE -^^^^^^^^^^^^ - -A return value of zero indicates that the caller still owns the port -and the call did not block. - -A positive return value from ``parport_yield_blocking`` indicates that -the caller still owns the port and the call blocked. - -A return value of -EAGAIN indicates that the caller no longer owns the -port, and it must be re-claimed before use. - -ERRORS -^^^^^^ - -========= ========================================================== - -EAGAIN Ownership of the parallel port was given away. -========= ========================================================== - -SEE ALSO -^^^^^^^^ - -parport_release - - - -parport_wait_peripheral - wait for status lines, up to 35ms ------------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_wait_peripheral (struct parport *port, - unsigned char mask, - unsigned char val); - -DESCRIPTION -^^^^^^^^^^^ - -Wait for the status lines in mask to match the values in val. - -RETURN VALUE -^^^^^^^^^^^^ - -======== ========================================================== - -EINTR a signal is pending - 0 the status lines in mask have values in val - 1 timed out while waiting (35ms elapsed) -======== ========================================================== - -SEE ALSO -^^^^^^^^ - -parport_poll_peripheral - - - -parport_poll_peripheral - wait for status lines, in usec --------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_poll_peripheral (struct parport *port, - unsigned char mask, - unsigned char val, - int usec); - -DESCRIPTION -^^^^^^^^^^^ - -Wait for the status lines in mask to match the values in val. - -RETURN VALUE -^^^^^^^^^^^^ - -======== ========================================================== - -EINTR a signal is pending - 0 the status lines in mask have values in val - 1 timed out while waiting (usec microseconds have elapsed) -======== ========================================================== - -SEE ALSO -^^^^^^^^ - -parport_wait_peripheral - - - -parport_wait_event - wait for an event on a port ------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_wait_event (struct parport *port, signed long timeout) - -DESCRIPTION -^^^^^^^^^^^ - -Wait for an event (e.g. interrupt) on a port. The timeout is in -jiffies. - -RETURN VALUE -^^^^^^^^^^^^ - -======= ========================================================== - 0 success - <0 error (exit as soon as possible) - >0 timed out -======= ========================================================== - -parport_negotiate - perform IEEE 1284 negotiation -------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_negotiate (struct parport *, int mode); - -DESCRIPTION -^^^^^^^^^^^ - -Perform IEEE 1284 negotiation. - -RETURN VALUE -^^^^^^^^^^^^ - -======= ========================================================== - 0 handshake OK; IEEE 1284 peripheral and mode available - -1 handshake failed; peripheral not compliant (or none present) - 1 handshake OK; IEEE 1284 peripheral present but mode not - available -======= ========================================================== - -SEE ALSO -^^^^^^^^ - -parport_read, parport_write - - - -parport_read - read data from device ------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - ssize_t parport_read (struct parport *, void *buf, size_t len); - -DESCRIPTION -^^^^^^^^^^^ - -Read data from device in current IEEE 1284 transfer mode. This only -works for modes that support reverse data transfer. - -RETURN VALUE -^^^^^^^^^^^^ - -If negative, an error code; otherwise the number of bytes transferred. - -SEE ALSO -^^^^^^^^ - -parport_write, parport_negotiate - - - -parport_write - write data to device ------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - ssize_t parport_write (struct parport *, const void *buf, size_t len); - -DESCRIPTION -^^^^^^^^^^^ - -Write data to device in current IEEE 1284 transfer mode. This only -works for modes that support forward data transfer. - -RETURN VALUE -^^^^^^^^^^^^ - -If negative, an error code; otherwise the number of bytes transferred. - -SEE ALSO -^^^^^^^^ - -parport_read, parport_negotiate - - - -parport_open - register device for particular device number ------------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct pardevice *parport_open (int devnum, const char *name, - int (*pf) (void *), - void (*kf) (void *), - void (*irqf) (int, void *, - struct pt_regs *), - int flags, void *handle); - -DESCRIPTION -^^^^^^^^^^^ - -This is like parport_register_device but takes a device number instead -of a pointer to a struct parport. - -RETURN VALUE -^^^^^^^^^^^^ - -See parport_register_device. If no device is associated with devnum, -NULL is returned. - -SEE ALSO -^^^^^^^^ - -parport_register_device - - - -parport_close - unregister device for particular device number --------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - void parport_close (struct pardevice *dev); - -DESCRIPTION -^^^^^^^^^^^ - -This is the equivalent of parport_unregister_device for parport_open. - -SEE ALSO -^^^^^^^^ - -parport_unregister_device, parport_open - - - -parport_device_id - obtain IEEE 1284 Device ID ----------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - ssize_t parport_device_id (int devnum, char *buffer, size_t len); - -DESCRIPTION -^^^^^^^^^^^ - -Obtains the IEEE 1284 Device ID associated with a given device. - -RETURN VALUE -^^^^^^^^^^^^ - -If negative, an error code; otherwise, the number of bytes of buffer -that contain the device ID. The format of the device ID is as -follows:: - - [length][ID] - -The first two bytes indicate the inclusive length of the entire Device -ID, and are in big-endian order. The ID is a sequence of pairs of the -form:: - - key:value; - -NOTES -^^^^^ - -Many devices have ill-formed IEEE 1284 Device IDs. - -SEE ALSO -^^^^^^^^ - -parport_find_class, parport_find_device - - - -parport_device_coords - convert device number to device coordinates -------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_device_coords (int devnum, int *parport, int *mux, - int *daisy); - -DESCRIPTION -^^^^^^^^^^^ - -Convert between device number (zero-based) and device coordinates -(port, multiplexor, daisy chain address). - -RETURN VALUE -^^^^^^^^^^^^ - -Zero on success, in which case the coordinates are (``*parport``, ``*mux``, -``*daisy``). - -SEE ALSO -^^^^^^^^ - -parport_open, parport_device_id - - - -parport_find_class - find a device by its class ------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - typedef enum { - PARPORT_CLASS_LEGACY = 0, /* Non-IEEE1284 device */ - PARPORT_CLASS_PRINTER, - PARPORT_CLASS_MODEM, - PARPORT_CLASS_NET, - PARPORT_CLASS_HDC, /* Hard disk controller */ - PARPORT_CLASS_PCMCIA, - PARPORT_CLASS_MEDIA, /* Multimedia device */ - PARPORT_CLASS_FDC, /* Floppy disk controller */ - PARPORT_CLASS_PORTS, - PARPORT_CLASS_SCANNER, - PARPORT_CLASS_DIGCAM, - PARPORT_CLASS_OTHER, /* Anything else */ - PARPORT_CLASS_UNSPEC, /* No CLS field in ID */ - PARPORT_CLASS_SCSIADAPTER - } parport_device_class; - - int parport_find_class (parport_device_class cls, int from); - -DESCRIPTION -^^^^^^^^^^^ - -Find a device by class. The search starts from device number from+1. - -RETURN VALUE -^^^^^^^^^^^^ - -The device number of the next device in that class, or -1 if no such -device exists. - -NOTES -^^^^^ - -Example usage:: - - int devnum = -1; - while ((devnum = parport_find_class (PARPORT_CLASS_DIGCAM, devnum)) != -1) { - struct pardevice *dev = parport_open (devnum, ...); - ... - } - -SEE ALSO -^^^^^^^^ - -parport_find_device, parport_open, parport_device_id - - - -parport_find_device - find a device by its class ------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - int parport_find_device (const char *mfg, const char *mdl, int from); - -DESCRIPTION -^^^^^^^^^^^ - -Find a device by vendor and model. The search starts from device -number from+1. - -RETURN VALUE -^^^^^^^^^^^^ - -The device number of the next device matching the specifications, or --1 if no such device exists. - -NOTES -^^^^^ - -Example usage:: - - int devnum = -1; - while ((devnum = parport_find_device ("IOMEGA", "ZIP+", devnum)) != -1) { - struct pardevice *dev = parport_open (devnum, ...); - ... - } - -SEE ALSO -^^^^^^^^ - -parport_find_class, parport_open, parport_device_id - - - -parport_set_timeout - set the inactivity timeout ------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - long parport_set_timeout (struct pardevice *dev, long inactivity); - -DESCRIPTION -^^^^^^^^^^^ - -Set the inactivity timeout, in jiffies, for a registered device. The -previous timeout is returned. - -RETURN VALUE -^^^^^^^^^^^^ - -The previous timeout, in jiffies. - -NOTES -^^^^^ - -Some of the port->ops functions for a parport may take time, owing to -delays at the peripheral. After the peripheral has not responded for -``inactivity`` jiffies, a timeout will occur and the blocking function -will return. - -A timeout of 0 jiffies is a special case: the function must do as much -as it can without blocking or leaving the hardware in an unknown -state. If port operations are performed from within an interrupt -handler, for instance, a timeout of 0 jiffies should be used. - -Once set for a registered device, the timeout will remain at the set -value until set again. - -SEE ALSO -^^^^^^^^ - -port->ops->xxx_read/write_yyy - - - - -PORT FUNCTIONS -============== - -The functions in the port->ops structure (struct parport_operations) -are provided by the low-level driver responsible for that port. - -port->ops->read_data - read the data register ---------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - unsigned char (*read_data) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -If port->modes contains the PARPORT_MODE_TRISTATE flag and the -PARPORT_CONTROL_DIRECTION bit in the control register is set, this -returns the value on the data pins. If port->modes contains the -PARPORT_MODE_TRISTATE flag and the PARPORT_CONTROL_DIRECTION bit is -not set, the return value _may_ be the last value written to the data -register. Otherwise the return value is undefined. - -SEE ALSO -^^^^^^^^ - -write_data, read_status, write_control - - - -port->ops->write_data - write the data register ------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*write_data) (struct parport *port, unsigned char d); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes to the data register. May have side-effects (a STROBE pulse, -for instance). - -SEE ALSO -^^^^^^^^ - -read_data, read_status, write_control - - - -port->ops->read_status - read the status register -------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - unsigned char (*read_status) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads from the status register. This is a bitmask: - -- PARPORT_STATUS_ERROR (printer fault, "nFault") -- PARPORT_STATUS_SELECT (on-line, "Select") -- PARPORT_STATUS_PAPEROUT (no paper, "PError") -- PARPORT_STATUS_ACK (handshake, "nAck") -- PARPORT_STATUS_BUSY (busy, "Busy") - -There may be other bits set. - -SEE ALSO -^^^^^^^^ - -read_data, write_data, write_control - - - -port->ops->read_control - read the control register ---------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - unsigned char (*read_control) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Returns the last value written to the control register (either from -write_control or frob_control). No port access is performed. - -SEE ALSO -^^^^^^^^ - -read_data, write_data, read_status, write_control - - - -port->ops->write_control - write the control register ------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*write_control) (struct parport *port, unsigned char s); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes to the control register. This is a bitmask:: - - _______ - - PARPORT_CONTROL_STROBE (nStrobe) - _______ - - PARPORT_CONTROL_AUTOFD (nAutoFd) - _____ - - PARPORT_CONTROL_INIT (nInit) - _________ - - PARPORT_CONTROL_SELECT (nSelectIn) - -SEE ALSO -^^^^^^^^ - -read_data, write_data, read_status, frob_control - - - -port->ops->frob_control - write control register bits ------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - unsigned char (*frob_control) (struct parport *port, - unsigned char mask, - unsigned char val); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -This is equivalent to reading from the control register, masking out -the bits in mask, exclusive-or'ing with the bits in val, and writing -the result to the control register. - -As some ports don't allow reads from the control port, a software copy -of its contents is maintained, so frob_control is in fact only one -port access. - -SEE ALSO -^^^^^^^^ - -read_data, write_data, read_status, write_control - - - -port->ops->enable_irq - enable interrupt generation ---------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*enable_irq) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -The parallel port hardware is instructed to generate interrupts at -appropriate moments, although those moments are -architecture-specific. For the PC architecture, interrupts are -commonly generated on the rising edge of nAck. - -SEE ALSO -^^^^^^^^ - -disable_irq - - - -port->ops->disable_irq - disable interrupt generation ------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*disable_irq) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -The parallel port hardware is instructed not to generate interrupts. -The interrupt itself is not masked. - -SEE ALSO -^^^^^^^^ - -enable_irq - - - -port->ops->data_forward - enable data drivers ---------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*data_forward) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Enables the data line drivers, for 8-bit host-to-peripheral -communications. - -SEE ALSO -^^^^^^^^ - -data_reverse - - - -port->ops->data_reverse - tristate the buffer ---------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - void (*data_reverse) (struct parport *port); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Places the data bus in a high impedance state, if port->modes has the -PARPORT_MODE_TRISTATE bit set. - -SEE ALSO -^^^^^^^^ - -data_forward - - - -port->ops->epp_write_data - write EPP data ------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*epp_write_data) (struct parport *port, const void *buf, - size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes data in EPP mode, and returns the number of bytes written. - -The ``flags`` parameter may be one or more of the following, -bitwise-or'ed together: - -======================= ================================================= -PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and - 32-bit registers. However, if a transfer - times out, the return value may be unreliable. -======================= ================================================= - -SEE ALSO -^^^^^^^^ - -epp_read_data, epp_write_addr, epp_read_addr - - - -port->ops->epp_read_data - read EPP data ----------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*epp_read_data) (struct parport *port, void *buf, - size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads data in EPP mode, and returns the number of bytes read. - -The ``flags`` parameter may be one or more of the following, -bitwise-or'ed together: - -======================= ================================================= -PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and - 32-bit registers. However, if a transfer - times out, the return value may be unreliable. -======================= ================================================= - -SEE ALSO -^^^^^^^^ - -epp_write_data, epp_write_addr, epp_read_addr - - - -port->ops->epp_write_addr - write EPP address ---------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*epp_write_addr) (struct parport *port, - const void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes EPP addresses (8 bits each), and returns the number written. - -The ``flags`` parameter may be one or more of the following, -bitwise-or'ed together: - -======================= ================================================= -PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and - 32-bit registers. However, if a transfer - times out, the return value may be unreliable. -======================= ================================================= - -(Does PARPORT_EPP_FAST make sense for this function?) - -SEE ALSO -^^^^^^^^ - -epp_write_data, epp_read_data, epp_read_addr - - - -port->ops->epp_read_addr - read EPP address -------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*epp_read_addr) (struct parport *port, void *buf, - size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads EPP addresses (8 bits each), and returns the number read. - -The ``flags`` parameter may be one or more of the following, -bitwise-or'ed together: - -======================= ================================================= -PARPORT_EPP_FAST Use fast transfers. Some chips provide 16-bit and - 32-bit registers. However, if a transfer - times out, the return value may be unreliable. -======================= ================================================= - -(Does PARPORT_EPP_FAST make sense for this function?) - -SEE ALSO -^^^^^^^^ - -epp_write_data, epp_read_data, epp_write_addr - - - -port->ops->ecp_write_data - write a block of ECP data ------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*ecp_write_data) (struct parport *port, - const void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes a block of ECP data. The ``flags`` parameter is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of bytes written. - -SEE ALSO -^^^^^^^^ - -ecp_read_data, ecp_write_addr - - - -port->ops->ecp_read_data - read a block of ECP data ---------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*ecp_read_data) (struct parport *port, - void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads a block of ECP data. The ``flags`` parameter is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of bytes read. NB. There may be more unread data in a -FIFO. Is there a way of stunning the FIFO to prevent this? - -SEE ALSO -^^^^^^^^ - -ecp_write_block, ecp_write_addr - - - -port->ops->ecp_write_addr - write a block of ECP addresses ----------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*ecp_write_addr) (struct parport *port, - const void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes a block of ECP addresses. The ``flags`` parameter is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of bytes written. - -NOTES -^^^^^ - -This may use a FIFO, and if so shall not return until the FIFO is empty. - -SEE ALSO -^^^^^^^^ - -ecp_read_data, ecp_write_data - - - -port->ops->nibble_read_data - read a block of data in nibble mode ------------------------------------------------------------------ - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*nibble_read_data) (struct parport *port, - void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads a block of data in nibble mode. The ``flags`` parameter is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of whole bytes read. - -SEE ALSO -^^^^^^^^ - -byte_read_data, compat_write_data - - - -port->ops->byte_read_data - read a block of data in byte mode -------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*byte_read_data) (struct parport *port, - void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Reads a block of data in byte mode. The ``flags`` parameter is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of bytes read. - -SEE ALSO -^^^^^^^^ - -nibble_read_data, compat_write_data - - - -port->ops->compat_write_data - write a block of data in compatibility mode --------------------------------------------------------------------------- - -SYNOPSIS -^^^^^^^^ - -:: - - #include - - struct parport_operations { - ... - size_t (*compat_write_data) (struct parport *port, - const void *buf, size_t len, int flags); - ... - }; - -DESCRIPTION -^^^^^^^^^^^ - -Writes a block of data in compatibility mode. The ``flags`` parameter -is ignored. - -RETURN VALUE -^^^^^^^^^^^^ - -The number of bytes written. - -SEE ALSO -^^^^^^^^ - -nibble_read_data, byte_read_data diff --git a/Documentation/pti/pti_intel_mid.rst b/Documentation/pti/pti_intel_mid.rst deleted file mode 100644 index ea05725174cb..000000000000 --- a/Documentation/pti/pti_intel_mid.rst +++ /dev/null @@ -1,106 +0,0 @@ -:orphan: - -============= -Intel MID PTI -============= - -The Intel MID PTI project is HW implemented in Intel Atom -system-on-a-chip designs based on the Parallel Trace -Interface for MIPI P1149.7 cJTAG standard. The kernel solution -for this platform involves the following files:: - - ./include/linux/pti.h - ./drivers/.../n_tracesink.h - ./drivers/.../n_tracerouter.c - ./drivers/.../n_tracesink.c - ./drivers/.../pti.c - -pti.c is the driver that enables various debugging features -popular on platforms from certain mobile manufacturers. -n_tracerouter.c and n_tracesink.c allow extra system information to -be collected and routed to the pti driver, such as trace -debugging data from a modem. Although n_tracerouter -and n_tracesink are a part of the complete PTI solution, -these two line disciplines can work separately from -pti.c and route any data stream from one /dev/tty node -to another /dev/tty node via kernel-space. This provides -a stable, reliable connection that will not break unless -the user-space application shuts down (plus avoids -kernel->user->kernel context switch overheads of routing -data). - -An example debugging usage for this driver system: - - * Hook /dev/ttyPTI0 to syslogd. Opening this port will also start - a console device to further capture debugging messages to PTI. - * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW. - This is where n_tracerouter and n_tracesink are used. - * Hook /dev/pti to a user-level debugging application for writing - to PTI HW. - * `Use mipi_` Kernel Driver API in other device drivers for - debugging to PTI by first requesting a PTI write address via - mipi_request_masterchannel(1). - -Below is example pseudo-code on how a 'privileged' application -can hook up n_tracerouter and n_tracesink to any tty on -a system. 'Privileged' means the application has enough -privileges to successfully manipulate the ldisc drivers -but is not just blindly executing as 'root'. Keep in mind -the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter -and n_tracesink line discpline drivers but is a generic -operation for a program to use a line discpline driver -on a tty port other than the default n_tty:: - - /////////// To hook up n_tracerouter and n_tracesink ///////// - - // Note that n_tracerouter depends on n_tracesink. - #include - #define ONE_TTY "/dev/ttyOne" - #define TWO_TTY "/dev/ttyTwo" - - // needed global to hand onto ldisc connection - static int g_fd_source = -1; - static int g_fd_sink = -1; - - // these two vars used to grab LDISC values from loaded ldisc drivers - // in OS. Look at /proc/tty/ldiscs to get the right numbers from - // the ldiscs loaded in the system. - int source_ldisc_num, sink_ldisc_num = -1; - int retval; - - g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W - g_fd_sink = open(TWO_TTY, O_RDWR); // must be R/W - - if (g_fd_source <= 0) || (g_fd_sink <= 0) { - // doubt you'll want to use these exact error lines of code - printf("Error on open(). errno: %d\n",errno); - return errno; - } - - retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num); - if (retval < 0) { - printf("Error on ioctl(). errno: %d\n", errno); - return errno; - } - - retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num); - if (retval < 0) { - printf("Error on ioctl(). errno: %d\n", errno); - return errno; - } - - /////////// To disconnect n_tracerouter and n_tracesink //////// - - // First make sure data through the ldiscs has stopped. - - // Second, disconnect ldiscs. This provides a - // little cleaner shutdown on tty stack. - sink_ldisc_num = 0; - source_ldisc_num = 0; - ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num); - ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num); - - // Three, program closes connection, and cleanup: - close(g_fd_uart); - close(g_fd_gadget); - g_fd_uart = g_fd_gadget = NULL; diff --git a/Documentation/pwm.txt b/Documentation/pwm.txt deleted file mode 100644 index ab62f1bb0366..000000000000 --- a/Documentation/pwm.txt +++ /dev/null @@ -1,165 +0,0 @@ -====================================== -Pulse Width Modulation (PWM) interface -====================================== - -This provides an overview about the Linux PWM interface - -PWMs are commonly used for controlling LEDs, fans or vibrators in -cell phones. PWMs with a fixed purpose have no need implementing -the Linux PWM API (although they could). However, PWMs are often -found as discrete devices on SoCs which have no fixed purpose. It's -up to the board designer to connect them to LEDs or fans. To provide -this kind of flexibility the generic PWM API exists. - -Identifying PWMs ----------------- - -Users of the legacy PWM API use unique IDs to refer to PWM devices. - -Instead of referring to a PWM device via its unique ID, board setup code -should instead register a static mapping that can be used to match PWM -consumers to providers, as given in the following example:: - - static struct pwm_lookup board_pwm_lookup[] = { - PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL, - 50000, PWM_POLARITY_NORMAL), - }; - - static void __init board_init(void) - { - ... - pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup)); - ... - } - -Using PWMs ----------- - -Legacy users can request a PWM device using pwm_request() and free it -after usage with pwm_free(). - -New users should use the pwm_get() function and pass to it the consumer -device or a consumer name. pwm_put() is used to free the PWM device. Managed -variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist. - -After being requested, a PWM has to be configured using:: - - int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state); - -This API controls both the PWM period/duty_cycle config and the -enable/disable state. - -The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers -around pwm_apply_state() and should not be used if the user wants to change -several parameter at once. For example, if you see pwm_config() and -pwm_{enable,disable}() calls in the same function, this probably means you -should switch to pwm_apply_state(). - -The PWM user API also allows one to query the PWM state with pwm_get_state(). - -In addition to the PWM state, the PWM API also exposes PWM arguments, which -are the reference PWM config one should use on this PWM. -PWM arguments are usually platform-specific and allows the PWM user to only -care about dutycycle relatively to the full period (like, duty = 50% of the -period). struct pwm_args contains 2 fields (period and polarity) and should -be used to set the initial PWM config (usually done in the probe function -of the PWM user). PWM arguments are retrieved with pwm_get_args(). - -All consumers should really be reconfiguring the PWM upon resume as -appropriate. This is the only way to ensure that everything is resumed in -the proper order. - -Using PWMs with the sysfs interface ------------------------------------ - -If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs -interface is provided to use the PWMs from userspace. It is exposed at -/sys/class/pwm/. Each probed PWM controller/chip will be exported as -pwmchipN, where N is the base of the PWM chip. Inside the directory you -will find: - - npwm - The number of PWM channels this chip supports (read-only). - - export - Exports a PWM channel for use with sysfs (write-only). - - unexport - Unexports a PWM channel from sysfs (write-only). - -The PWM channels are numbered using a per-chip index from 0 to npwm-1. - -When a PWM channel is exported a pwmX directory will be created in the -pwmchipN directory it is associated with, where X is the number of the -channel that was exported. The following properties will then be available: - - period - The total period of the PWM signal (read/write). - Value is in nanoseconds and is the sum of the active and inactive - time of the PWM. - - duty_cycle - The active time of the PWM signal (read/write). - Value is in nanoseconds and must be less than the period. - - polarity - Changes the polarity of the PWM signal (read/write). - Writes to this property only work if the PWM chip supports changing - the polarity. The polarity can only be changed if the PWM is not - enabled. Value is the string "normal" or "inversed". - - enable - Enable/disable the PWM signal (read/write). - - - 0 - disabled - - 1 - enabled - -Implementing a PWM driver -------------------------- - -Currently there are two ways to implement pwm drivers. Traditionally -there only has been the barebone API meaning that each driver has -to implement the pwm_*() functions itself. This means that it's impossible -to have multiple PWM drivers in the system. For this reason it's mandatory -for new drivers to use the generic PWM framework. - -A new PWM controller/chip can be added using pwmchip_add() and removed -again with pwmchip_remove(). pwmchip_add() takes a filled in struct -pwm_chip as argument which provides a description of the PWM chip, the -number of PWM devices provided by the chip and the chip-specific -implementation of the supported PWM operations to the framework. - -When implementing polarity support in a PWM driver, make sure to respect the -signal conventions in the PWM framework. By definition, normal polarity -characterizes a signal starts high for the duration of the duty cycle and -goes low for the remainder of the period. Conversely, a signal with inversed -polarity starts low for the duration of the duty cycle and goes high for the -remainder of the period. - -Drivers are encouraged to implement ->apply() instead of the legacy -->enable(), ->disable() and ->config() methods. Doing that should provide -atomicity in the PWM config workflow, which is required when the PWM controls -a critical device (like a regulator). - -The implementation of ->get_state() (a method used to retrieve initial PWM -state) is also encouraged for the same reason: letting the PWM user know -about the current PWM state would allow him to avoid glitches. - -Drivers should not implement any power management. In other words, -consumers should implement it as described in the "Using PWMs" section. - -Locking -------- - -The PWM core list manipulations are protected by a mutex, so pwm_request() -and pwm_free() may not be called from an atomic context. Currently the -PWM core does not enforce any locking to pwm_enable(), pwm_disable() and -pwm_config(), so the calling context is currently driver specific. This -is an issue derived from the former barebone API and should be fixed soon. - -Helpers -------- - -Currently a PWM can only be configured with period_ns and duty_ns. For several -use cases freq_hz and duty_percent might be better. Instead of calculating -this in your driver please consider adding appropriate helpers to the framework. diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt deleted file mode 100644 index 7d3684e81df6..000000000000 --- a/Documentation/rfkill.txt +++ /dev/null @@ -1,132 +0,0 @@ -=============================== -rfkill - RF kill switch support -=============================== - - -.. contents:: - :depth: 2 - -Introduction -============ - -The rfkill subsystem provides a generic interface for disabling any radio -transmitter in the system. When a transmitter is blocked, it shall not -radiate any power. - -The subsystem also provides the ability to react on button presses and -disable all transmitters of a certain type (or all). This is intended for -situations where transmitters need to be turned off, for example on -aircraft. - -The rfkill subsystem has a concept of "hard" and "soft" block, which -differ little in their meaning (block == transmitters off) but rather in -whether they can be changed or not: - - - hard block - read-only radio block that cannot be overridden by software - - - soft block - writable radio block (need not be readable) that is set by - the system software. - -The rfkill subsystem has two parameters, rfkill.default_state and -rfkill.master_switch_mode, which are documented in -admin-guide/kernel-parameters.rst. - - -Implementation details -====================== - -The rfkill subsystem is composed of three main components: - - * the rfkill core, - * the deprecated rfkill-input module (an input layer handler, being - replaced by userspace policy code) and - * the rfkill drivers. - -The rfkill core provides API for kernel drivers to register their radio -transmitter with the kernel, methods for turning it on and off, and letting -the system know about hardware-disabled states that may be implemented on -the device. - -The rfkill core code also notifies userspace of state changes, and provides -ways for userspace to query the current states. See the "Userspace support" -section below. - -When the device is hard-blocked (either by a call to rfkill_set_hw_state() -or from query_hw_block), set_block() will be invoked for additional software -block, but drivers can ignore the method call since they can use the return -value of the function rfkill_set_hw_state() to sync the software state -instead of keeping track of calls to set_block(). In fact, drivers should -use the return value of rfkill_set_hw_state() unless the hardware actually -keeps track of soft and hard block separately. - - -Kernel API -========== - -Drivers for radio transmitters normally implement an rfkill driver. - -Platform drivers might implement input devices if the rfkill button is just -that, a button. If that button influences the hardware then you need to -implement an rfkill driver instead. This also applies if the platform provides -a way to turn on/off the transmitter(s). - -For some platforms, it is possible that the hardware state changes during -suspend/hibernation, in which case it will be necessary to update the rfkill -core with the current state at resume time. - -To create an rfkill driver, driver's Kconfig needs to have:: - - depends on RFKILL || !RFKILL - -to ensure the driver cannot be built-in when rfkill is modular. The !RFKILL -case allows the driver to be built when rfkill is not configured, in which -case all rfkill API can still be used but will be provided by static inlines -which compile to almost nothing. - -Calling rfkill_set_hw_state() when a state change happens is required from -rfkill drivers that control devices that can be hard-blocked unless they also -assign the poll_hw_block() callback (then the rfkill core will poll the -device). Don't do this unless you cannot get the event in any other way. - -rfkill provides per-switch LED triggers, which can be used to drive LEDs -according to the switch state (LED_FULL when blocked, LED_OFF otherwise). - - -Userspace support -================= - -The recommended userspace interface to use is /dev/rfkill, which is a misc -character device that allows userspace to obtain and set the state of rfkill -devices and sets of devices. It also notifies userspace about device addition -and removal. The API is a simple read/write API that is defined in -linux/rfkill.h, with one ioctl that allows turning off the deprecated input -handler in the kernel for the transition period. - -Except for the one ioctl, communication with the kernel is done via read() -and write() of instances of 'struct rfkill_event'. In this structure, the -soft and hard block are properly separated (unlike sysfs, see below) and -userspace is able to get a consistent snapshot of all rfkill devices in the -system. Also, it is possible to switch all rfkill drivers (or all drivers of -a specified type) into a state which also updates the default state for -hotplugged devices. - -After an application opens /dev/rfkill, it can read the current state of all -devices. Changes can be obtained by either polling the descriptor for -hotplug or state change events or by listening for uevents emitted by the -rfkill core framework. - -Additionally, each rfkill device is registered in sysfs and emits uevents. - -rfkill devices issue uevents (with an action of "change"), with the following -environment variables set:: - - RFKILL_NAME - RFKILL_STATE - RFKILL_TYPE - -The content of these variables corresponds to the "name", "state" and -"type" sysfs files explained above. - -For further details consult Documentation/ABI/stable/sysfs-class-rfkill. diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 1f6d0b56d53e..1e210c6afa88 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -38,7 +38,7 @@ every detail. More information/reference could be found here: qemu/hw/s390x/css.c For vfio mediated device framework: -- Documentation/vfio-mediated-device.txt +- Documentation/driver-api/vfio-mediated-device.rst Motivation of vfio-ccw ---------------------- @@ -322,5 +322,5 @@ Reference 2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) 3. https://en.wikipedia.org/wiki/Channel_I/O 4. Documentation/s390/cds.rst -5. Documentation/vfio.txt -6. Documentation/vfio-mediated-device.txt +5. Documentation/driver-api/vfio.rst +6. Documentation/driver-api/vfio-mediated-device.rst diff --git a/Documentation/sgi-ioc4.txt b/Documentation/sgi-ioc4.txt deleted file mode 100644 index 72709222d3c0..000000000000 --- a/Documentation/sgi-ioc4.txt +++ /dev/null @@ -1,49 +0,0 @@ -==================================== -SGI IOC4 PCI (multi function) device -==================================== - -The SGI IOC4 PCI device is a bit of a strange beast, so some notes on -it are in order. - -First, even though the IOC4 performs multiple functions, such as an -IDE controller, a serial controller, a PS/2 keyboard/mouse controller, -and an external interrupt mechanism, it's not implemented as a -multifunction device. The consequence of this from a software -standpoint is that all these functions share a single IRQ, and -they can't all register to own the same PCI device ID. To make -matters a bit worse, some of the register blocks (and even registers -themselves) present in IOC4 are mixed-purpose between these several -functions, meaning that there's no clear "owning" device driver. - -The solution is to organize the IOC4 driver into several independent -drivers, "ioc4", "sgiioc4", and "ioc4_serial". Note that there is no -PS/2 controller driver as this functionality has never been wired up -on a shipping IO card. - -ioc4 -==== -This is the core (or shim) driver for IOC4. It is responsible for -initializing the basic functionality of the chip, and allocating -the PCI resources that are shared between the IOC4 functions. - -This driver also provides registration functions that the other -IOC4 drivers can call to make their presence known. Each driver -needs to provide a probe and remove function, which are invoked -by the core driver at appropriate times. The interface of these -IOC4 function probe and remove operations isn't precisely the same -as PCI device probe and remove operations, but is logically the -same operation. - -sgiioc4 -======= -This is the IDE driver for IOC4. Its name isn't very descriptive -simply for historical reasons (it used to be the only IOC4 driver -component). There's not much to say about it other than it hooks -up to the ioc4 driver via the appropriate registration, probe, and -remove functions. - -ioc4_serial -=========== -This is the serial driver for IOC4. There's not much to say about it -other than it hooks up to the ioc4 driver via the appropriate registration, -probe, and remove functions. diff --git a/Documentation/smsc_ece1099.txt b/Documentation/smsc_ece1099.txt deleted file mode 100644 index 079277421eaf..000000000000 --- a/Documentation/smsc_ece1099.txt +++ /dev/null @@ -1,60 +0,0 @@ -================================================= -Msc Keyboard Scan Expansion/GPIO Expansion device -================================================= - -What is smsc-ece1099? ----------------------- - -The ECE1099 is a 40-Pin 3.3V Keyboard Scan Expansion -or GPIO Expansion device. The device supports a keyboard -scan matrix of 23x8. The device is connected to a Master -via the SMSC BC-Link interface or via the SMBus. -Keypad scan Input(KSI) and Keypad Scan Output(KSO) signals -are multiplexed with GPIOs. - -Interrupt generation --------------------- - -Interrupts can be generated by an edge detection on a GPIO -pin or an edge detection on one of the bus interface pins. -Interrupts can also be detected on the keyboard scan interface. -The bus interrupt pin (BC_INT# or SMBUS_INT#) is asserted if -any bit in one of the Interrupt Status registers is 1 and -the corresponding Interrupt Mask bit is also 1. - -In order for software to determine which device is the source -of an interrupt, it should first read the Group Interrupt Status Register -to determine which Status register group is a source for the interrupt. -Software should read both the Status register and the associated Mask register, -then AND the two values together. Bits that are 1 in the result of the AND -are active interrupts. Software clears an interrupt by writing a 1 to the -corresponding bit in the Status register. - -Communication Protocol ----------------------- - -- SMbus slave Interface - The host processor communicates with the ECE1099 device - through a series of read/write registers via the SMBus - interface. SMBus is a serial communication protocol between - a computer host and its peripheral devices. The SMBus data - rate is 10KHz minimum to 400 KHz maximum - -- Slave Bus Interface - The ECE1099 device SMBus implementation is a subset of the - SMBus interface to the host. The device is a slave-only SMBus device. - The implementation in the device is a subset of SMBus since it - only supports four protocols. - - The Write Byte, Read Byte, Send Byte, and Receive Byte protocols are the - only valid SMBus protocols for the device. - -- BC-LinkTM Interface - The BC-Link is a proprietary bus that allows communication - between a Master device and a Companion device. The Master - device uses this serial bus to read and write registers - located on the Companion device. The bus comprises three signals, - BC_CLK, BC_DAT and BC_INT#. The Master device always provides the - clock, BC_CLK, and the Companion device is the source for an - independent asynchronous interrupt signal, BC_INT#. The ECE1099 - supports BC-Link speeds up to 24MHz. diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt deleted file mode 100644 index 30d6a64e53f7..000000000000 --- a/Documentation/switchtec.txt +++ /dev/null @@ -1,102 +0,0 @@ -======================== -Linux Switchtec Support -======================== - -Microsemi's "Switchtec" line of PCI switch devices is already -supported by the kernel with standard PCI switch drivers. However, the -Switchtec device advertises a special management endpoint which -enables some additional functionality. This includes: - -* Packet and Byte Counters -* Firmware Upgrades -* Event and Error logs -* Querying port link status -* Custom user firmware commands - -The switchtec kernel module implements this functionality. - - -Interface -========= - -The primary means of communicating with the Switchtec management firmware is -through the Memory-mapped Remote Procedure Call (MRPC) interface. -Commands are submitted to the interface with a 4-byte command -identifier and up to 1KB of command specific data. The firmware will -respond with a 4-byte return code and up to 1KB of command-specific -data. The interface only processes a single command at a time. - - -Userspace Interface -=================== - -The MRPC interface will be exposed to userspace through a simple char -device: /dev/switchtec#, one for each management endpoint in the system. - -The char device has the following semantics: - -* A write must consist of at least 4 bytes and no more than 1028 bytes. - The first 4 bytes will be interpreted as the Command ID and the - remainder will be used as the input data. A write will send the - command to the firmware to begin processing. - -* Each write must be followed by exactly one read. Any double write will - produce an error and any read that doesn't follow a write will - produce an error. - -* A read will block until the firmware completes the command and return - the 4-byte Command Return Value plus up to 1024 bytes of output - data. (The length will be specified by the size parameter of the read - call -- reading less than 4 bytes will produce an error.) - -* The poll call will also be supported for userspace applications that - need to do other things while waiting for the command to complete. - -The following IOCTLs are also supported by the device: - -* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number - of partitions in the device. - -* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for - any specified partition in flash. - -* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps - indicating all uncleared events. - -* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags - for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct - with the event_id, index and flags set (index being the partition or PFF - number for non-global events). It returns whether the event has - occurred, the number of times and any event specific data. The flags - can be used to clear the count or enable and disable actions to - happen when the event occurs. - By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag, - you can set an event to trigger a poll command to return with - POLLPRI. In this way, userspace can wait for events to occur. - -* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert - between PCI Function Framework number (used by the event system) - and Switchtec Logic Port ID and Partition number (which is more - user friendly). - - -Non-Transparent Bridge (NTB) Driver -=================================== - -An NTB hardware driver is provided for the Switchtec hardware in -ntb_hw_switchtec. Currently, it only supports switches configured with -exactly 2 NT partitions and zero or more non-NT partitions. It also requires -the following configuration settings: - -* Both NT partitions must be able to access each other's GAS spaces. - Thus, the bits in the GAS Access Vector under Management Settings - must be set to support this. -* Kernel configuration MUST include support for NTB (CONFIG_NTB needs - to be set) - -NT EP BAR 2 will be dynamically configured as a Direct Window, and -the configuration file does not need to configure it explicitly. - -Please refer to Documentation/ntb.txt in Linux source tree for an overall -understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB -Hardware Driver in this stack. diff --git a/Documentation/sync_file.txt b/Documentation/sync_file.txt deleted file mode 100644 index 496fb2c3b3e6..000000000000 --- a/Documentation/sync_file.txt +++ /dev/null @@ -1,86 +0,0 @@ -=================== -Sync File API Guide -=================== - -:Author: Gustavo Padovan - -This document serves as a guide for device drivers writers on what the -sync_file API is, and how drivers can support it. Sync file is the carrier of -the fences(struct dma_fence) that are needed to synchronize between drivers or -across process boundaries. - -The sync_file API is meant to be used to send and receive fence information -to/from userspace. It enables userspace to do explicit fencing, where instead -of attaching a fence to the buffer a producer driver (such as a GPU or V4L -driver) sends the fence related to the buffer to userspace via a sync_file. - -The sync_file then can be sent to the consumer (DRM driver for example), that -will not use the buffer for anything before the fence(s) signals, i.e., the -driver that issued the fence is not using/processing the buffer anymore, so it -signals that the buffer is ready to use. And vice-versa for the consumer -> -producer part of the cycle. - -Sync files allows userspace awareness on buffer sharing synchronization between -drivers. - -Sync file was originally added in the Android kernel but current Linux Desktop -can benefit a lot from it. - -in-fences and out-fences ------------------------- - -Sync files can go either to or from userspace. When a sync_file is sent from -the driver to userspace we call the fences it contains 'out-fences'. They are -related to a buffer that the driver is processing or is going to process, so -the driver creates an out-fence to be able to notify, through -dma_fence_signal(), when it has finished using (or processing) that buffer. -Out-fences are fences that the driver creates. - -On the other hand if the driver receives fence(s) through a sync_file from -userspace we call these fence(s) 'in-fences'. Receiving in-fences means that -we need to wait for the fence(s) to signal before using any buffer related to -the in-fences. - -Creating Sync Files -------------------- - -When a driver needs to send an out-fence userspace it creates a sync_file. - -Interface:: - - struct sync_file *sync_file_create(struct dma_fence *fence); - -The caller pass the out-fence and gets back the sync_file. That is just the -first step, next it needs to install an fd on sync_file->file. So it gets an -fd:: - - fd = get_unused_fd_flags(O_CLOEXEC); - -and installs it on sync_file->file:: - - fd_install(fd, sync_file->file); - -The sync_file fd now can be sent to userspace. - -If the creation process fail, or the sync_file needs to be released by any -other reason fput(sync_file->file) should be used. - -Receiving Sync Files from Userspace ------------------------------------ - -When userspace needs to send an in-fence to the driver it passes file descriptor -of the Sync File to the kernel. The kernel can then retrieve the fences -from it. - -Interface:: - - struct dma_fence *sync_file_get_fence(int fd); - - -The returned reference is owned by the caller and must be disposed of -afterwards using dma_fence_put(). In case of error, a NULL is returned instead. - -References: - -1. struct sync_file in include/linux/sync_file.h -2. All interfaces mentioned above defined in include/linux/sync_file.h diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt deleted file mode 100644 index c3f69bcaf96e..000000000000 --- a/Documentation/vfio-mediated-device.txt +++ /dev/null @@ -1,414 +0,0 @@ -.. include:: - -===================== -VFIO Mediated devices -===================== - -:Copyright: |copy| 2016, NVIDIA CORPORATION. All rights reserved. -:Author: Neo Jia -:Author: Kirti Wankhede - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License version 2 as -published by the Free Software Foundation. - - -Virtual Function I/O (VFIO) Mediated devices[1] -=============================================== - -The number of use cases for virtualizing DMA devices that do not have built-in -SR_IOV capability is increasing. Previously, to virtualize such devices, -developers had to create their own management interfaces and APIs, and then -integrate them with user space software. To simplify integration with user space -software, we have identified common requirements and a unified management -interface for such devices. - -The VFIO driver framework provides unified APIs for direct device access. It is -an IOMMU/device-agnostic framework for exposing direct device access to user -space in a secure, IOMMU-protected environment. This framework is used for -multiple devices, such as GPUs, network adapters, and compute accelerators. With -direct device access, virtual machines or user space applications have direct -access to the physical device. This framework is reused for mediated devices. - -The mediated core driver provides a common interface for mediated device -management that can be used by drivers of different devices. This module -provides a generic interface to perform these operations: - -* Create and destroy a mediated device -* Add a mediated device to and remove it from a mediated bus driver -* Add a mediated device to and remove it from an IOMMU group - -The mediated core driver also provides an interface to register a bus driver. -For example, the mediated VFIO mdev driver is designed for mediated devices and -supports VFIO APIs. The mediated bus driver adds a mediated device to and -removes it from a VFIO group. - -The following high-level block diagram shows the main components and interfaces -in the VFIO mediated driver framework. The diagram shows NVIDIA, Intel, and IBM -devices as examples, as these devices are the first devices to use this module:: - - +---------------+ - | | - | +-----------+ | mdev_register_driver() +--------------+ - | | | +<------------------------+ | - | | mdev | | | | - | | bus | +------------------------>+ vfio_mdev.ko |<-> VFIO user - | | driver | | probe()/remove() | | APIs - | | | | +--------------+ - | +-----------+ | - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +-----------+ | mdev_register_device() +--------------+ - | | | +<------------------------+ | - | | | | | nvidia.ko |<-> physical - | | | +------------------------>+ | device - | | | | callbacks +--------------+ - | | Physical | | - | | device | | mdev_register_device() +--------------+ - | | interface | |<------------------------+ | - | | | | | i915.ko |<-> physical - | | | +------------------------>+ | device - | | | | callbacks +--------------+ - | | | | - | | | | mdev_register_device() +--------------+ - | | | +<------------------------+ | - | | | | | ccw_device.ko|<-> physical - | | | +------------------------>+ | device - | | | | callbacks +--------------+ - | +-----------+ | - +---------------+ - - -Registration Interfaces -======================= - -The mediated core driver provides the following types of registration -interfaces: - -* Registration interface for a mediated bus driver -* Physical device driver interface - -Registration Interface for a Mediated Bus Driver ------------------------------------------------- - -The registration interface for a mediated bus driver provides the following -structure to represent a mediated device's driver:: - - /* - * struct mdev_driver [2] - Mediated device's driver - * @name: driver name - * @probe: called when new device created - * @remove: called when device removed - * @driver: device driver structure - */ - struct mdev_driver { - const char *name; - int (*probe) (struct device *dev); - void (*remove) (struct device *dev); - struct device_driver driver; - }; - -A mediated bus driver for mdev should use this structure in the function calls -to register and unregister itself with the core driver: - -* Register:: - - extern int mdev_register_driver(struct mdev_driver *drv, - struct module *owner); - -* Unregister:: - - extern void mdev_unregister_driver(struct mdev_driver *drv); - -The mediated bus driver is responsible for adding mediated devices to the VFIO -group when devices are bound to the driver and removing mediated devices from -the VFIO when devices are unbound from the driver. - - -Physical Device Driver Interface --------------------------------- - -The physical device driver interface provides the mdev_parent_ops[3] structure -to define the APIs to manage work in the mediated core driver that is related -to the physical device. - -The structures in the mdev_parent_ops structure are as follows: - -* dev_attr_groups: attributes of the parent device -* mdev_attr_groups: attributes of the mediated device -* supported_config: attributes to define supported configurations - -The functions in the mdev_parent_ops structure are as follows: - -* create: allocate basic resources in a driver for a mediated device -* remove: free resources in a driver when a mediated device is destroyed - -(Note that mdev-core provides no implicit serialization of create/remove -callbacks per mdev parent device, per mdev type, or any other categorization. -Vendor drivers are expected to be fully asynchronous in this respect or -provide their own internal resource protection.) - -The callbacks in the mdev_parent_ops structure are as follows: - -* open: open callback of mediated device -* close: close callback of mediated device -* ioctl: ioctl callback of mediated device -* read : read emulation callback -* write: write emulation callback -* mmap: mmap emulation callback - -A driver should use the mdev_parent_ops structure in the function call to -register itself with the mdev core driver:: - - extern int mdev_register_device(struct device *dev, - const struct mdev_parent_ops *ops); - -However, the mdev_parent_ops structure is not required in the function call -that a driver should use to unregister itself with the mdev core driver:: - - extern void mdev_unregister_device(struct device *dev); - - -Mediated Device Management Interface Through sysfs -================================================== - -The management interface through sysfs enables user space software, such as -libvirt, to query and configure mediated devices in a hardware-agnostic fashion. -This management interface provides flexibility to the underlying physical -device's driver to support features such as: - -* Mediated device hot plug -* Multiple mediated devices in a single virtual machine -* Multiple mediated devices from different physical devices - -Links in the mdev_bus Class Directory -------------------------------------- -The /sys/class/mdev_bus/ directory contains links to devices that are registered -with the mdev core driver. - -Directories and files under the sysfs for Each Physical Device --------------------------------------------------------------- - -:: - - |- [parent physical device] - |--- Vendor-specific-attributes [optional] - |--- [mdev_supported_types] - | |--- [] - | | |--- create - | | |--- name - | | |--- available_instances - | | |--- device_api - | | |--- description - | | |--- [devices] - | |--- [] - | | |--- create - | | |--- name - | | |--- available_instances - | | |--- device_api - | | |--- description - | | |--- [devices] - | |--- [] - | |--- create - | |--- name - | |--- available_instances - | |--- device_api - | |--- description - | |--- [devices] - -* [mdev_supported_types] - - The list of currently supported mediated device types and their details. - - [], device_api, and available_instances are mandatory attributes - that should be provided by vendor driver. - -* [] - - The [] name is created by adding the device driver string as a prefix - to the string provided by the vendor driver. This format of this name is as - follows:: - - sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name); - - (or using mdev_parent_dev(mdev) to arrive at the parent device outside - of the core mdev code) - -* device_api - - This attribute should show which device API is being created, for example, - "vfio-pci" for a PCI device. - -* available_instances - - This attribute should show the number of devices of type that can be - created. - -* [device] - - This directory contains links to the devices of type that have been - created. - -* name - - This attribute should show human readable name. This is optional attribute. - -* description - - This attribute should show brief features/description of the type. This is - optional attribute. - -Directories and Files Under the sysfs for Each mdev Device ----------------------------------------------------------- - -:: - - |- [parent phy device] - |--- [$MDEV_UUID] - |--- remove - |--- mdev_type {link to its type} - |--- vendor-specific-attributes [optional] - -* remove (write only) - -Writing '1' to the 'remove' file destroys the mdev device. The vendor driver can -fail the remove() callback if that device is active and the vendor driver -doesn't support hot unplug. - -Example:: - - # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove - -Mediated device Hot plug ------------------------- - -Mediated devices can be created and assigned at runtime. The procedure to hot -plug a mediated device is the same as the procedure to hot plug a PCI device. - -Translation APIs for Mediated Devices -===================================== - -The following APIs are provided for translating user pfn to host pfn in a VFIO -driver:: - - extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, - int npage, int prot, unsigned long *phys_pfn); - - extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, - int npage); - -These functions call back into the back-end IOMMU module by using the pin_pages -and unpin_pages callbacks of the struct vfio_iommu_driver_ops[4]. Currently -these callbacks are supported in the TYPE1 IOMMU module. To enable them for -other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide -these two callback functions. - -Using the Sample Code -===================== - -mtty.c in samples/vfio-mdev/ directory is a sample driver program to -demonstrate how to use the mediated device framework. - -The sample driver creates an mdev device that simulates a serial port over a PCI -card. - -1. Build and load the mtty.ko module. - - This step creates a dummy device, /sys/devices/virtual/mtty/mtty/ - - Files in this device directory in sysfs are similar to the following:: - - # tree /sys/devices/virtual/mtty/mtty/ - /sys/devices/virtual/mtty/mtty/ - |-- mdev_supported_types - | |-- mtty-1 - | | |-- available_instances - | | |-- create - | | |-- device_api - | | |-- devices - | | `-- name - | `-- mtty-2 - | |-- available_instances - | |-- create - | |-- device_api - | |-- devices - | `-- name - |-- mtty_dev - | `-- sample_mtty_dev - |-- power - | |-- autosuspend_delay_ms - | |-- control - | |-- runtime_active_time - | |-- runtime_status - | `-- runtime_suspended_time - |-- subsystem -> ../../../../class/mtty - `-- uevent - -2. Create a mediated device by using the dummy device that you created in the - previous step:: - - # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \ - /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create - -3. Add parameters to qemu-kvm:: - - -device vfio-pci,\ - sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 - -4. Boot the VM. - - In the Linux guest VM, with no hardware on the host, the device appears - as follows:: - - # lspci -s 00:05.0 -xxvv - 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550]) - Subsystem: Device 4348:3253 - Physical Slot: 5 - Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- - Stepping- SERR- FastB2B- DisINTx- - Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort- - SERR- Link[LNKA] -> GSI 10 (level, high) -> IRQ 10 - 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A - 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A - - -5. In the Linux guest VM, check the serial ports:: - - # setserial -g /dev/ttyS* - /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4 - /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10 - /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10 - -6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or - /dev/ttyS2 with hardware flow control disabled. - -7. Type data on the minicom terminal or send data to the terminal emulation - program and read the data. - - Data is loop backed from hosts mtty driver. - -8. Destroy the mediated device that you created:: - - # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove - -References -========== - -1. See Documentation/vfio.txt for more information on VFIO. -2. struct mdev_driver in include/linux/mdev.h -3. struct mdev_parent_ops in include/linux/mdev.h -4. struct vfio_iommu_driver_ops in include/linux/vfio.h diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt deleted file mode 100644 index f1a4d3c3ba0b..000000000000 --- a/Documentation/vfio.txt +++ /dev/null @@ -1,520 +0,0 @@ -================================== -VFIO - "Virtual Function I/O" [1]_ -================================== - -Many modern system now provide DMA and interrupt remapping facilities -to help ensure I/O devices behave within the boundaries they've been -allotted. This includes x86 hardware with AMD-Vi and Intel VT-d, -POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC -systems such as Freescale PAMU. The VFIO driver is an IOMMU/device -agnostic framework for exposing direct device access to userspace, in -a secure, IOMMU protected environment. In other words, this allows -safe [2]_, non-privileged, userspace drivers. - -Why do we want that? Virtual machines often make use of direct device -access ("device assignment") when configured for the highest possible -I/O performance. From a device and host perspective, this simply -turns the VM into a userspace driver, with the benefits of -significantly reduced latency, higher bandwidth, and direct use of -bare-metal device drivers [3]_. - -Some applications, particularly in the high performance computing -field, also benefit from low-overhead, direct device access from -userspace. Examples include network adapters (often non-TCP/IP based) -and compute accelerators. Prior to VFIO, these drivers had to either -go through the full development cycle to become proper upstream -driver, be maintained out of tree, or make use of the UIO framework, -which has no notion of IOMMU protection, limited interrupt support, -and requires root privileges to access things like PCI configuration -space. - -The VFIO driver framework intends to unify these, replacing both the -KVM PCI specific device assignment code as well as provide a more -secure, more featureful userspace driver environment than UIO. - -Groups, Devices, and IOMMUs ---------------------------- - -Devices are the main target of any I/O driver. Devices typically -create a programming interface made up of I/O access, interrupts, -and DMA. Without going into the details of each of these, DMA is -by far the most critical aspect for maintaining a secure environment -as allowing a device read-write access to system memory imposes the -greatest risk to the overall system integrity. - -To help mitigate this risk, many modern IOMMUs now incorporate -isolation properties into what was, in many cases, an interface only -meant for translation (ie. solving the addressing problems of devices -with limited address spaces). With this, devices can now be isolated -from each other and from arbitrary memory access, thus allowing -things like secure direct assignment of devices into virtual machines. - -This isolation is not always at the granularity of a single device -though. Even when an IOMMU is capable of this, properties of devices, -interconnects, and IOMMU topologies can each reduce this isolation. -For instance, an individual device may be part of a larger multi- -function enclosure. While the IOMMU may be able to distinguish -between devices within the enclosure, the enclosure may not require -transactions between devices to reach the IOMMU. Examples of this -could be anything from a multi-function PCI device with backdoors -between functions to a non-PCI-ACS (Access Control Services) capable -bridge allowing redirection without reaching the IOMMU. Topology -can also play a factor in terms of hiding devices. A PCIe-to-PCI -bridge masks the devices behind it, making transaction appear as if -from the bridge itself. Obviously IOMMU design plays a major factor -as well. - -Therefore, while for the most part an IOMMU may have device level -granularity, any system is susceptible to reduced granularity. The -IOMMU API therefore supports a notion of IOMMU groups. A group is -a set of devices which is isolatable from all other devices in the -system. Groups are therefore the unit of ownership used by VFIO. - -While the group is the minimum granularity that must be used to -ensure secure user access, it's not necessarily the preferred -granularity. In IOMMUs which make use of page tables, it may be -possible to share a set of page tables between different groups, -reducing the overhead both to the platform (reduced TLB thrashing, -reduced duplicate page tables), and to the user (programming only -a single set of translations). For this reason, VFIO makes use of -a container class, which may hold one or more groups. A container -is created by simply opening the /dev/vfio/vfio character device. - -On its own, the container provides little functionality, with all -but a couple version and extension query interfaces locked away. -The user needs to add a group into the container for the next level -of functionality. To do this, the user first needs to identify the -group associated with the desired device. This can be done using -the sysfs links described in the example below. By unbinding the -device from the host driver and binding it to a VFIO driver, a new -VFIO group will appear for the group as /dev/vfio/$GROUP, where -$GROUP is the IOMMU group number of which the device is a member. -If the IOMMU group contains multiple devices, each will need to -be bound to a VFIO driver before operations on the VFIO group -are allowed (it's also sufficient to only unbind the device from -host drivers if a VFIO driver is unavailable; this will make the -group available, but not that particular device). TBD - interface -for disabling driver probing/locking a device. - -Once the group is ready, it may be added to the container by opening -the VFIO group character device (/dev/vfio/$GROUP) and using the -VFIO_GROUP_SET_CONTAINER ioctl, passing the file descriptor of the -previously opened container file. If desired and if the IOMMU driver -supports sharing the IOMMU context between groups, multiple groups may -be set to the same container. If a group fails to set to a container -with existing groups, a new empty container will need to be used -instead. - -With a group (or groups) attached to a container, the remaining -ioctls become available, enabling access to the VFIO IOMMU interfaces. -Additionally, it now becomes possible to get file descriptors for each -device within a group using an ioctl on the VFIO group file descriptor. - -The VFIO device API includes ioctls for describing the device, the I/O -regions and their read/write/mmap offsets on the device descriptor, as -well as mechanisms for describing and registering interrupt -notifications. - -VFIO Usage Example ------------------- - -Assume user wants to access PCI device 0000:06:0d.0:: - - $ readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group - ../../../../kernel/iommu_groups/26 - -This device is therefore in IOMMU group 26. This device is on the -pci bus, therefore the user will make use of vfio-pci to manage the -group:: - - # modprobe vfio-pci - -Binding this device to the vfio-pci driver creates the VFIO group -character devices for this group:: - - $ lspci -n -s 0000:06:0d.0 - 06:0d.0 0401: 1102:0002 (rev 08) - # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind - # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id - -Now we need to look at what other devices are in the group to free -it for use by VFIO:: - - $ ls -l /sys/bus/pci/devices/0000:06:0d.0/iommu_group/devices - total 0 - lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:00:1e.0 -> - ../../../../devices/pci0000:00/0000:00:1e.0 - lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.0 -> - ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.0 - lrwxrwxrwx. 1 root root 0 Apr 23 16:13 0000:06:0d.1 -> - ../../../../devices/pci0000:00/0000:00:1e.0/0000:06:0d.1 - -This device is behind a PCIe-to-PCI bridge [4]_, therefore we also -need to add device 0000:06:0d.1 to the group following the same -procedure as above. Device 0000:00:1e.0 is a bridge that does -not currently have a host driver, therefore it's not required to -bind this device to the vfio-pci driver (vfio-pci does not currently -support PCI bridges). - -The final step is to provide the user with access to the group if -unprivileged operation is desired (note that /dev/vfio/vfio provides -no capabilities on its own and is therefore expected to be set to -mode 0666 by the system):: - - # chown user:user /dev/vfio/26 - -The user now has full access to all the devices and the iommu for this -group and can access them as follows:: - - int container, group, device, i; - struct vfio_group_status group_status = - { .argsz = sizeof(group_status) }; - struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; - struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) }; - struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; - - /* Create a new container */ - container = open("/dev/vfio/vfio", O_RDWR); - - if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) - /* Unknown API version */ - - if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) - /* Doesn't support the IOMMU driver we want. */ - - /* Open the group */ - group = open("/dev/vfio/26", O_RDWR); - - /* Test the group is viable and available */ - ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); - - if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) - /* Group is not viable (ie, not all devices bound for vfio) */ - - /* Add the group to the container */ - ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); - - /* Enable the IOMMU model we want */ - ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); - - /* Get addition IOMMU info */ - ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); - - /* Allocate some space and setup a DMA mapping */ - dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - dma_map.size = 1024 * 1024; - dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); - - /* Get a file descriptor for the device */ - device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); - - /* Test and setup the device */ - ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); - - for (i = 0; i < device_info.num_regions; i++) { - struct vfio_region_info reg = { .argsz = sizeof(reg) }; - - reg.index = i; - - ioctl(device, VFIO_DEVICE_GET_REGION_INFO, ®); - - /* Setup mappings... read/write offsets, mmaps - * For PCI devices, config space is a region */ - } - - for (i = 0; i < device_info.num_irqs; i++) { - struct vfio_irq_info irq = { .argsz = sizeof(irq) }; - - irq.index = i; - - ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq); - - /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */ - } - - /* Gratuitous device reset and go... */ - ioctl(device, VFIO_DEVICE_RESET); - -VFIO User API -------------------------------------------------------------------------------- - -Please see include/linux/vfio.h for complete API documentation. - -VFIO bus driver API -------------------------------------------------------------------------------- - -VFIO bus drivers, such as vfio-pci make use of only a few interfaces -into VFIO core. When devices are bound and unbound to the driver, -the driver should call vfio_add_group_dev() and vfio_del_group_dev() -respectively:: - - extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - - extern void *vfio_del_group_dev(struct device *dev); - -vfio_add_group_dev() indicates to the core to begin tracking the -iommu_group of the specified dev and register the dev as owned by -a VFIO bus driver. The driver provides an ops structure for callbacks -similar to a file operations structure:: - - struct vfio_device_ops { - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, - size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, - size_t size, loff_t *ppos); - long (*ioctl)(void *device_data, unsigned int cmd, - unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); - }; - -Each function is passed the device_data that was originally registered -in the vfio_add_group_dev() call above. This allows the bus driver -an easy place to store its opaque, private data. The open/release -callbacks are issued when a new file descriptor is created for a -device (via VFIO_GROUP_GET_DEVICE_FD). The ioctl interface provides -a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap -interfaces implement the device region access defined by the device's -own VFIO_DEVICE_GET_REGION_INFO ioctl. - - -PPC64 sPAPR implementation note -------------------------------- - -This implementation has some specifics: - -1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per - container is supported as an IOMMU table is allocated at the boot time, - one table per a IOMMU group which is a Partitionable Endpoint (PE) - (PE is often a PCI domain but not always). - - Newer systems (POWER8 with IODA2) have improved hardware design which allows - to remove this limitation and have multiple IOMMU groups per a VFIO - container. - -2) The hardware supports so called DMA windows - the PCI address range - within which DMA transfer is allowed, any attempt to access address space - out of the window leads to the whole PE isolation. - -3) PPC64 guests are paravirtualized but not fully emulated. There is an API - to map/unmap pages for DMA, and it normally maps 1..32 pages per call and - currently there is no way to reduce the number of calls. In order to make - things faster, the map/unmap handling has been implemented in real mode - which provides an excellent performance which has limitations such as - inability to do locked pages accounting in real time. - -4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O - subtree that can be treated as a unit for the purposes of partitioning and - error recovery. A PE may be a single or multi-function IOA (IO Adapter), a - function of a multi-function IOA, or multiple IOAs (possibly including - switch and bridge structures above the multiple IOAs). PPC64 guests detect - PCI errors and recover from them via EEH RTAS services, which works on the - basis of additional ioctl commands. - - So 4 additional ioctls have been added: - - VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. - - VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting - is done at this point. This lets user first to know what - the DMA window is and adjust rlimit before doing any real job. - - VFIO_IOMMU_DISABLE - disables the container. - - VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. - - The code flow from the example above should be slightly changed:: - - struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op), .flags = 0 }; - - ..... - /* Add the group to the container */ - ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); - - /* Enable the IOMMU model we want */ - ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) - - /* Get addition sPAPR IOMMU info */ - vfio_iommu_spapr_tce_info spapr_iommu_info; - ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info); - - if (ioctl(container, VFIO_IOMMU_ENABLE)) - /* Cannot enable container, may be low rlimit */ - - /* Allocate some space and setup a DMA mapping */ - dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - - dma_map.size = 1024 * 1024; - dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); - - /* Get a file descriptor for the device */ - device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0"); - - .... - - /* Gratuitous device reset and go... */ - ioctl(device, VFIO_DEVICE_RESET); - - /* Make sure EEH is supported */ - ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); - - /* Enable the EEH functionality on the device */ - pe_op.op = VFIO_EEH_PE_ENABLE; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* You're suggested to create additional data struct to represent - * PE, and put child devices belonging to same IOMMU group to the - * PE instance for later reference. - */ - - /* Check the PE's state and make sure it's in functional state */ - pe_op.op = VFIO_EEH_PE_GET_STATE; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* Save device state using pci_save_state(). - * EEH should be enabled on the specified device. - */ - - .... - - /* Inject EEH error, which is expected to be caused by 32-bits - * config load. - */ - pe_op.op = VFIO_EEH_PE_INJECT_ERR; - pe_op.err.type = EEH_ERR_TYPE_32; - pe_op.err.func = EEH_ERR_FUNC_LD_CFG_ADDR; - pe_op.err.addr = 0ul; - pe_op.err.mask = 0ul; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - .... - - /* When 0xFF's returned from reading PCI config space or IO BARs - * of the PCI device. Check the PE's state to see if that has been - * frozen. - */ - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* Waiting for pending PCI transactions to be completed and don't - * produce any more PCI traffic from/to the affected PE until - * recovery is finished. - */ - - /* Enable IO for the affected PE and collect logs. Usually, the - * standard part of PCI config space, AER registers are dumped - * as logs for further analysis. - */ - pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* - * Issue PE reset: hot or fundamental reset. Usually, hot reset - * is enough. However, the firmware of some PCI adapters would - * require fundamental reset. - */ - pe_op.op = VFIO_EEH_PE_RESET_HOT; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* Configure the PCI bridges for the affected PE */ - pe_op.op = VFIO_EEH_PE_CONFIGURE; - ioctl(container, VFIO_EEH_PE_OP, &pe_op); - - /* Restored state we saved at initialization time. pci_restore_state() - * is good enough as an example. - */ - - /* Hopefully, error is recovered successfully. Now, you can resume to - * start PCI traffic to/from the affected PE. - */ - - .... - -5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/ - VFIO_IOMMU_DISABLE and implements 2 new ioctls: - VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - (which are unsupported in v1 IOMMU). - - PPC64 paravirtualized guests generate a lot of map/unmap requests, - and the handling of those includes pinning/unpinning pages and updating - mm::locked_vm counter to make sure we do not exceed the rlimit. - The v2 IOMMU splits accounting and pinning into separate operations: - - - VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls - receive a user space address and size of the block to be pinned. - Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to - be called with the exact address and size used for registering - the memory block. The userspace is not expected to call these often. - The ranges are stored in a linked list in a VFIO container. - - - VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual - IOMMU table and do not do pinning; instead these check that the userspace - address is from pre-registered range. - - This separation helps in optimizing DMA for guests. - -6) sPAPR specification allows guests to have an additional DMA window(s) on - a PCI bus with a variable page size. Two ioctls have been added to support - this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE. - The platform has to support the functionality or error will be returned to - the userspace. The existing hardware supports up to 2 DMA windows, one is - 2GB long, uses 4K pages and called "default 32bit window"; the other can - be as big as entire RAM, use different page size, it is optional - guests - create those in run-time if the guest driver supports 64bit DMA. - - VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and - a number of TCE table levels (if a TCE table is going to be big enough and - the kernel may not be able to allocate enough of physically contiguous - memory). It creates a new window in the available slot and returns the bus - address where the new window starts. Due to hardware limitation, the user - space cannot choose the location of DMA windows. - - VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window - and removes it. - -------------------------------------------------------------------------------- - -.. [1] VFIO was originally an acronym for "Virtual Function I/O" in its - initial implementation by Tom Lyon while as Cisco. We've since - outgrown the acronym, but it's catchy. - -.. [2] "safe" also depends upon a device being "well behaved". It's - possible for multi-function devices to have backdoors between - functions and even for single function devices to have alternative - access to things like PCI config space through MMIO registers. To - guard against the former we can include additional precautions in the - IOMMU driver to group multi-function PCI devices together - (iommu=group_mf). The latter we can't prevent, but the IOMMU should - still provide isolation. For PCI, SR-IOV Virtual Functions are the - best indicator of "well behaved", as these are designed for - virtualization usage models. - -.. [3] As always there are trade-offs to virtual machine device - assignment that are beyond the scope of VFIO. It's expected that - future IOMMU technologies will reduce some, but maybe not all, of - these trade-offs. - -.. [4] In this case the device is below a PCI bridge, so transactions - from either function of the device are indistinguishable to the iommu:: - - -[0000:00]-+-1e.0-[06]--+-0d.0 - \-0d.1 - - 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) diff --git a/Documentation/w1/w1.netlink b/Documentation/w1/w1.netlink index ef2727192d69..94ad4c420828 100644 --- a/Documentation/w1/w1.netlink +++ b/Documentation/w1/w1.netlink @@ -183,7 +183,7 @@ acknowledge number is set to seq+1. Additional documantion, source code examples. ============================================ -1. Documentation/connector +1. Documentation/driver-api/connector.rst 2. http://www.ioremap.net/archive/w1 This archive includes userspace application w1d.c which uses read/write/search commands for all master/slave devices found on the bus. diff --git a/Documentation/xillybus.txt b/Documentation/xillybus.txt deleted file mode 100644 index 2446ee303c09..000000000000 --- a/Documentation/xillybus.txt +++ /dev/null @@ -1,379 +0,0 @@ -========================================== -Xillybus driver for generic FPGA interface -========================================== - -:Author: Eli Billauer, Xillybus Ltd. (http://xillybus.com) -:Email: eli.billauer@gmail.com or as advertised on Xillybus' site. - -.. Contents: - - - Introduction - -- Background - -- Xillybus Overview - - - Usage - -- User interface - -- Synchronization - -- Seekable pipes - - - Internals - -- Source code organization - -- Pipe attributes - -- Host never reads from the FPGA - -- Channels, pipes, and the message channel - -- Data streaming - -- Data granularity - -- Probing - -- Buffer allocation - -- The "nonempty" message (supporting poll) - - -Introduction -============ - -Background ----------- - -An FPGA (Field Programmable Gate Array) is a piece of logic hardware, which -can be programmed to become virtually anything that is usually found as a -dedicated chipset: For instance, a display adapter, network interface card, -or even a processor with its peripherals. FPGAs are the LEGO of hardware: -Based upon certain building blocks, you make your own toys the way you like -them. It's usually pointless to reimplement something that is already -available on the market as a chipset, so FPGAs are mostly used when some -special functionality is needed, and the production volume is relatively low -(hence not justifying the development of an ASIC). - -The challenge with FPGAs is that everything is implemented at a very low -level, even lower than assembly language. In order to allow FPGA designers to -focus on their specific project, and not reinvent the wheel over and over -again, pre-designed building blocks, IP cores, are often used. These are the -FPGA parallels of library functions. IP cores may implement certain -mathematical functions, a functional unit (e.g. a USB interface), an entire -processor (e.g. ARM) or anything that might come handy. Think of them as a -building block, with electrical wires dangling on the sides for connection to -other blocks. - -One of the daunting tasks in FPGA design is communicating with a fullblown -operating system (actually, with the processor running it): Implementing the -low-level bus protocol and the somewhat higher-level interface with the host -(registers, interrupts, DMA etc.) is a project in itself. When the FPGA's -function is a well-known one (e.g. a video adapter card, or a NIC), it can -make sense to design the FPGA's interface logic specifically for the project. -A special driver is then written to present the FPGA as a well-known interface -to the kernel and/or user space. In that case, there is no reason to treat the -FPGA differently than any device on the bus. - -It's however common that the desired data communication doesn't fit any well- -known peripheral function. Also, the effort of designing an elegant -abstraction for the data exchange is often considered too big. In those cases, -a quicker and possibly less elegant solution is sought: The driver is -effectively written as a user space program, leaving the kernel space part -with just elementary data transport. This still requires designing some -interface logic for the FPGA, and write a simple ad-hoc driver for the kernel. - -Xillybus Overview ------------------ - -Xillybus is an IP core and a Linux driver. Together, they form a kit for -elementary data transport between an FPGA and the host, providing pipe-like -data streams with a straightforward user interface. It's intended as a low- -effort solution for mixed FPGA-host projects, for which it makes sense to -have the project-specific part of the driver running in a user-space program. - -Since the communication requirements may vary significantly from one FPGA -project to another (the number of data pipes needed in each direction and -their attributes), there isn't one specific chunk of logic being the Xillybus -IP core. Rather, the IP core is configured and built based upon a -specification given by its end user. - -Xillybus presents independent data streams, which resemble pipes or TCP/IP -communication to the user. At the host side, a character device file is used -just like any pipe file. On the FPGA side, hardware FIFOs are used to stream -the data. This is contrary to a common method of communicating through fixed- -sized buffers (even though such buffers are used by Xillybus under the hood). -There may be more than a hundred of these streams on a single IP core, but -also no more than one, depending on the configuration. - -In order to ease the deployment of the Xillybus IP core, it contains a simple -data structure which completely defines the core's configuration. The Linux -driver fetches this data structure during its initialization process, and sets -up the DMA buffers and character devices accordingly. As a result, a single -driver is used to work out of the box with any Xillybus IP core. - -The data structure just mentioned should not be confused with PCI's -configuration space or the Flattened Device Tree. - -Usage -===== - -User interface --------------- - -On the host, all interface with Xillybus is done through /dev/xillybus_* -device files, which are generated automatically as the drivers loads. The -names of these files depend on the IP core that is loaded in the FPGA (see -Probing below). To communicate with the FPGA, open the device file that -corresponds to the hardware FIFO you want to send data or receive data from, -and use plain write() or read() calls, just like with a regular pipe. In -particular, it makes perfect sense to go:: - - $ cat mydata > /dev/xillybus_thisfifo - - $ cat /dev/xillybus_thatfifo > hisdata - -possibly pressing CTRL-C as some stage, even though the xillybus_* pipes have -the capability to send an EOF (but may not use it). - -The driver and hardware are designed to behave sensibly as pipes, including: - -* Supporting non-blocking I/O (by setting O_NONBLOCK on open() ). - -* Supporting poll() and select(). - -* Being bandwidth efficient under load (using DMA) but also handle small - pieces of data sent across (like TCP/IP) by autoflushing. - -A device file can be read only, write only or bidirectional. Bidirectional -device files are treated like two independent pipes (except for sharing a -"channel" structure in the implementation code). - -Synchronization ---------------- - -Xillybus pipes are configured (on the IP core) to be either synchronous or -asynchronous. For a synchronous pipe, write() returns successfully only after -some data has been submitted and acknowledged by the FPGA. This slows down -bulk data transfers, and is nearly impossible for use with streams that -require data at a constant rate: There is no data transmitted to the FPGA -between write() calls, in particular when the process loses the CPU. - -When a pipe is configured asynchronous, write() returns if there was enough -room in the buffers to store any of the data in the buffers. - -For FPGA to host pipes, asynchronous pipes allow data transfer from the FPGA -as soon as the respective device file is opened, regardless of if the data -has been requested by a read() call. On synchronous pipes, only the amount -of data requested by a read() call is transmitted. - -In summary, for synchronous pipes, data between the host and FPGA is -transmitted only to satisfy the read() or write() call currently handled -by the driver, and those calls wait for the transmission to complete before -returning. - -Note that the synchronization attribute has nothing to do with the possibility -that read() or write() completes less bytes than requested. There is a -separate configuration flag ("allowpartial") that determines whether such a -partial completion is allowed. - -Seekable pipes --------------- - -A synchronous pipe can be configured to have the stream's position exposed -to the user logic at the FPGA. Such a pipe is also seekable on the host API. -With this feature, a memory or register interface can be attached on the -FPGA side to the seekable stream. Reading or writing to a certain address in -the attached memory is done by seeking to the desired address, and calling -read() or write() as required. - - -Internals -========= - -Source code organization ------------------------- - -The Xillybus driver consists of a core module, xillybus_core.c, and modules -that depend on the specific bus interface (xillybus_of.c and xillybus_pcie.c). - -The bus specific modules are those probed when a suitable device is found by -the kernel. Since the DMA mapping and synchronization functions, which are bus -dependent by their nature, are used by the core module, a -xilly_endpoint_hardware structure is passed to the core module on -initialization. This structure is populated with pointers to wrapper functions -which execute the DMA-related operations on the bus. - -Pipe attributes ---------------- - -Each pipe has a number of attributes which are set when the FPGA component -(IP core) is built. They are fetched from the IDT (the data structure which -defines the core's configuration, see Probing below) by xilly_setupchannels() -in xillybus_core.c as follows: - -* is_writebuf: The pipe's direction. A non-zero value means it's an FPGA to - host pipe (the FPGA "writes"). - -* channelnum: The pipe's identification number in communication between the - host and FPGA. - -* format: The underlying data width. See Data Granularity below. - -* allowpartial: A non-zero value means that a read() or write() (whichever - applies) may return with less than the requested number of bytes. The common - choice is a non-zero value, to match standard UNIX behavior. - -* synchronous: A non-zero value means that the pipe is synchronous. See - Synchronization above. - -* bufsize: Each DMA buffer's size. Always a power of two. - -* bufnum: The number of buffers allocated for this pipe. Always a power of two. - -* exclusive_open: A non-zero value forces exclusive opening of the associated - device file. If the device file is bidirectional, and already opened only in - one direction, the opposite direction may be opened once. - -* seekable: A non-zero value indicates that the pipe is seekable. See - Seekable pipes above. - -* supports_nonempty: A non-zero value (which is typical) indicates that the - hardware will send the messages that are necessary to support select() and - poll() for this pipe. - -Host never reads from the FPGA ------------------------------- - -Even though PCI Express is hotpluggable in general, a typical motherboard -doesn't expect a card to go away all of the sudden. But since the PCIe card -is based upon reprogrammable logic, a sudden disappearance from the bus is -quite likely as a result of an accidental reprogramming of the FPGA while the -host is up. In practice, nothing happens immediately in such a situation. But -if the host attempts to read from an address that is mapped to the PCI Express -device, that leads to an immediate freeze of the system on some motherboards, -even though the PCIe standard requires a graceful recovery. - -In order to avoid these freezes, the Xillybus driver refrains completely from -reading from the device's register space. All communication from the FPGA to -the host is done through DMA. In particular, the Interrupt Service Routine -doesn't follow the common practice of checking a status register when it's -invoked. Rather, the FPGA prepares a small buffer which contains short -messages, which inform the host what the interrupt was about. - -This mechanism is used on non-PCIe buses as well for the sake of uniformity. - - -Channels, pipes, and the message channel ----------------------------------------- - -Each of the (possibly bidirectional) pipes presented to the user is allocated -a data channel between the FPGA and the host. The distinction between channels -and pipes is necessary only because of channel 0, which is used for interrupt- -related messages from the FPGA, and has no pipe attached to it. - -Data streaming --------------- - -Even though a non-segmented data stream is presented to the user at both -sides, the implementation relies on a set of DMA buffers which is allocated -for each channel. For the sake of illustration, let's take the FPGA to host -direction: As data streams into the respective channel's interface in the -FPGA, the Xillybus IP core writes it to one of the DMA buffers. When the -buffer is full, the FPGA informs the host about that (appending a -XILLYMSG_OPCODE_RELEASEBUF message channel 0 and sending an interrupt if -necessary). The host responds by making the data available for reading through -the character device. When all data has been read, the host writes on the -the FPGA's buffer control register, allowing the buffer's overwriting. Flow -control mechanisms exist on both sides to prevent underflows and overflows. - -This is not good enough for creating a TCP/IP-like stream: If the data flow -stops momentarily before a DMA buffer is filled, the intuitive expectation is -that the partial data in buffer will arrive anyhow, despite the buffer not -being completed. This is implemented by adding a field in the -XILLYMSG_OPCODE_RELEASEBUF message, through which the FPGA informs not just -which buffer is submitted, but how much data it contains. - -But the FPGA will submit a partially filled buffer only if directed to do so -by the host. This situation occurs when the read() method has been blocking -for XILLY_RX_TIMEOUT jiffies (currently 10 ms), after which the host commands -the FPGA to submit a DMA buffer as soon as it can. This timeout mechanism -balances between bus bandwidth efficiency (preventing a lot of partially -filled buffers being sent) and a latency held fairly low for tails of data. - -A similar setting is used in the host to FPGA direction. The handling of -partial DMA buffers is somewhat different, though. The user can tell the -driver to submit all data it has in the buffers to the FPGA, by issuing a -write() with the byte count set to zero. This is similar to a flush request, -but it doesn't block. There is also an autoflushing mechanism, which triggers -an equivalent flush roughly XILLY_RX_TIMEOUT jiffies after the last write(). -This allows the user to be oblivious about the underlying buffering mechanism -and yet enjoy a stream-like interface. - -Note that the issue of partial buffer flushing is irrelevant for pipes having -the "synchronous" attribute nonzero, since synchronous pipes don't allow data -to lay around in the DMA buffers between read() and write() anyhow. - -Data granularity ----------------- - -The data arrives or is sent at the FPGA as 8, 16 or 32 bit wide words, as -configured by the "format" attribute. Whenever possible, the driver attempts -to hide this when the pipe is accessed differently from its natural alignment. -For example, reading single bytes from a pipe with 32 bit granularity works -with no issues. Writing single bytes to pipes with 16 or 32 bit granularity -will also work, but the driver can't send partially completed words to the -FPGA, so the transmission of up to one word may be held until it's fully -occupied with user data. - -This somewhat complicates the handling of host to FPGA streams, because -when a buffer is flushed, it may contain up to 3 bytes don't form a word in -the FPGA, and hence can't be sent. To prevent loss of data, these leftover -bytes need to be moved to the next buffer. The parts in xillybus_core.c -that mention "leftovers" in some way are related to this complication. - -Probing -------- - -As mentioned earlier, the number of pipes that are created when the driver -loads and their attributes depend on the Xillybus IP core in the FPGA. During -the driver's initialization, a blob containing configuration info, the -Interface Description Table (IDT), is sent from the FPGA to the host. The -bootstrap process is done in three phases: - -1. Acquire the length of the IDT, so a buffer can be allocated for it. This - is done by sending a quiesce command to the device, since the acknowledge - for this command contains the IDT's buffer length. - -2. Acquire the IDT itself. - -3. Create the interfaces according to the IDT. - -Buffer allocation ------------------ - -In order to simplify the logic that prevents illegal boundary crossings of -PCIe packets, the following rule applies: If a buffer is smaller than 4kB, -it must not cross a 4kB boundary. Otherwise, it must be 4kB aligned. The -xilly_setupchannels() functions allocates these buffers by requesting whole -pages from the kernel, and diving them into DMA buffers as necessary. Since -all buffers' sizes are powers of two, it's possible to pack any set of such -buffers, with a maximal waste of one page of memory. - -All buffers are allocated when the driver is loaded. This is necessary, -since large continuous physical memory segments are sometimes requested, -which are more likely to be available when the system is freshly booted. - -The allocation of buffer memory takes place in the same order they appear in -the IDT. The driver relies on a rule that the pipes are sorted with decreasing -buffer size in the IDT. If a requested buffer is larger or equal to a page, -the necessary number of pages is requested from the kernel, and these are -used for this buffer. If the requested buffer is smaller than a page, one -single page is requested from the kernel, and that page is partially used. -Or, if there already is a partially used page at hand, the buffer is packed -into that page. It can be shown that all pages requested from the kernel -(except possibly for the last) are 100% utilized this way. - -The "nonempty" message (supporting poll) ----------------------------------------- - -In order to support the "poll" method (and hence select() ), there is a small -catch regarding the FPGA to host direction: The FPGA may have filled a DMA -buffer with some data, but not submitted that buffer. If the host waited for -the buffer's submission by the FPGA, there would be a possibility that the -FPGA side has sent data, but a select() call would still block, because the -host has not received any notification about this. This is solved with -XILLYMSG_OPCODE_NONEMPTY messages sent by the FPGA when a channel goes from -completely empty to containing some data. - -These messages are used only to support poll() and select(). The IP core can -be configured not to send them for a slight reduction of bandwidth. diff --git a/Documentation/zorro.txt b/Documentation/zorro.txt deleted file mode 100644 index 664072b017e3..000000000000 --- a/Documentation/zorro.txt +++ /dev/null @@ -1,104 +0,0 @@ -======================================== -Writing Device Drivers for Zorro Devices -======================================== - -:Author: Written by Geert Uytterhoeven -:Last revised: September 5, 2003 - - -Introduction ------------- - -The Zorro bus is the bus used in the Amiga family of computers. Thanks to -AutoConfig(tm), it's 100% Plug-and-Play. - -There are two types of Zorro buses, Zorro II and Zorro III: - - - The Zorro II address space is 24-bit and lies within the first 16 MB of the - Amiga's address map. - - - Zorro III is a 32-bit extension of Zorro II, which is backwards compatible - with Zorro II. The Zorro III address space lies outside the first 16 MB. - - -Probing for Zorro Devices -------------------------- - -Zorro devices are found by calling ``zorro_find_device()``, which returns a -pointer to the ``next`` Zorro device with the specified Zorro ID. A probe loop -for the board with Zorro ID ``ZORRO_PROD_xxx`` looks like:: - - struct zorro_dev *z = NULL; - - while ((z = zorro_find_device(ZORRO_PROD_xxx, z))) { - if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE, - "My explanation")) - ... - } - -``ZORRO_WILDCARD`` acts as a wildcard and finds any Zorro device. If your driver -supports different types of boards, you can use a construct like:: - - struct zorro_dev *z = NULL; - - while ((z = zorro_find_device(ZORRO_WILDCARD, z))) { - if (z->id != ZORRO_PROD_xxx1 && z->id != ZORRO_PROD_xxx2 && ...) - continue; - if (!zorro_request_region(z->resource.start+MY_START, MY_SIZE, - "My explanation")) - ... - } - - -Zorro Resources ---------------- - -Before you can access a Zorro device's registers, you have to make sure it's -not yet in use. This is done using the I/O memory space resource management -functions:: - - request_mem_region() - release_mem_region() - -Shortcuts to claim the whole device's address space are provided as well:: - - zorro_request_device - zorro_release_device - - -Accessing the Zorro Address Space ---------------------------------- - -The address regions in the Zorro device resources are Zorro bus address -regions. Due to the identity bus-physical address mapping on the Zorro bus, -they are CPU physical addresses as well. - -The treatment of these regions depends on the type of Zorro space: - - - Zorro II address space is always mapped and does not have to be mapped - explicitly using z_ioremap(). - - Conversion from bus/physical Zorro II addresses to kernel virtual addresses - and vice versa is done using:: - - virt_addr = ZTWO_VADDR(bus_addr); - bus_addr = ZTWO_PADDR(virt_addr); - - - Zorro III address space must be mapped explicitly using z_ioremap() first - before it can be accessed:: - - virt_addr = z_ioremap(bus_addr, size); - ... - z_iounmap(virt_addr); - - -References ----------- - -#. linux/include/linux/zorro.h -#. linux/include/uapi/linux/zorro.h -#. linux/include/uapi/linux/zorro_ids.h -#. linux/arch/m68k/include/asm/zorro.h -#. linux/drivers/zorro -#. /proc/bus/zorro - -- cgit From fb8c5327b3c6c78b74a27a3c42e4f32b2cc30a04 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 14:40:42 -0300 Subject: docs: driver-api: add xilinx driver API documentation The current file there (emmi) provides a description of the driver uAPI and kAPI. Signed-off-by: Mauro Carvalho Chehab --- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/xilinx/eemi.rst | 67 +++++++++++++++++++++++++++++++ Documentation/driver-api/xilinx/index.rst | 16 ++++++++ Documentation/xilinx/eemi.rst | 67 ------------------------------- Documentation/xilinx/index.rst | 17 -------- 5 files changed, 84 insertions(+), 84 deletions(-) create mode 100644 Documentation/driver-api/xilinx/eemi.rst create mode 100644 Documentation/driver-api/xilinx/index.rst delete mode 100644 Documentation/xilinx/eemi.rst delete mode 100644 Documentation/xilinx/index.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index d1c6513dd20d..77322753c1bc 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -93,6 +93,7 @@ available subsections can be seen below. sync_file vfio-mediated-device vfio + xilinx/index xillybus zorro diff --git a/Documentation/driver-api/xilinx/eemi.rst b/Documentation/driver-api/xilinx/eemi.rst new file mode 100644 index 000000000000..9dcbc6f18d75 --- /dev/null +++ b/Documentation/driver-api/xilinx/eemi.rst @@ -0,0 +1,67 @@ +==================================== +Xilinx Zynq MPSoC EEMI Documentation +==================================== + +Xilinx Zynq MPSoC Firmware Interface +------------------------------------- +The zynqmp-firmware node describes the interface to platform firmware. +ZynqMP has an interface to communicate with secure firmware. Firmware +driver provides an interface to firmware APIs. Interface APIs can be +used by any driver to communicate with PMC(Platform Management Controller). + +Embedded Energy Management Interface (EEMI) +---------------------------------------------- +The embedded energy management interface is used to allow software +components running across different processing clusters on a chip or +device to communicate with a power management controller (PMC) on a +device to issue or respond to power management requests. + +EEMI ops is a structure containing all eemi APIs supported by Zynq MPSoC. +The zynqmp-firmware driver maintain all EEMI APIs in zynqmp_eemi_ops +structure. Any driver who want to communicate with PMC using EEMI APIs +can call zynqmp_pm_get_eemi_ops(). + +Example of EEMI ops:: + + /* zynqmp-firmware driver maintain all EEMI APIs */ + struct zynqmp_eemi_ops { + int (*get_api_version)(u32 *version); + int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out); + }; + + static const struct zynqmp_eemi_ops eemi_ops = { + .get_api_version = zynqmp_pm_get_api_version, + .query_data = zynqmp_pm_query_data, + }; + +Example of EEMI ops usage:: + + static const struct zynqmp_eemi_ops *eemi_ops; + u32 ret_payload[PAYLOAD_ARG_CNT]; + int ret; + + eemi_ops = zynqmp_pm_get_eemi_ops(); + if (IS_ERR(eemi_ops)) + return PTR_ERR(eemi_ops); + + ret = eemi_ops->query_data(qdata, ret_payload); + +IOCTL +------ +IOCTL API is for device control and configuration. It is not a system +IOCTL but it is an EEMI API. This API can be used by master to control +any device specific configuration. IOCTL definitions can be platform +specific. This API also manage shared device configuration. + +The following IOCTL IDs are valid for device control: +- IOCTL_SET_PLL_FRAC_MODE 8 +- IOCTL_GET_PLL_FRAC_MODE 9 +- IOCTL_SET_PLL_FRAC_DATA 10 +- IOCTL_GET_PLL_FRAC_DATA 11 + +Refer EEMI API guide [0] for IOCTL specific parameters and other EEMI APIs. + +References +---------- +[0] Embedded Energy Management Interface (EEMI) API guide: + https://www.xilinx.com/support/documentation/user_guides/ug1200-eemi-api.pdf diff --git a/Documentation/driver-api/xilinx/index.rst b/Documentation/driver-api/xilinx/index.rst new file mode 100644 index 000000000000..13f7589ed442 --- /dev/null +++ b/Documentation/driver-api/xilinx/index.rst @@ -0,0 +1,16 @@ + +=========== +Xilinx FPGA +=========== + +.. toctree:: + :maxdepth: 1 + + eemi + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/xilinx/eemi.rst b/Documentation/xilinx/eemi.rst deleted file mode 100644 index 9dcbc6f18d75..000000000000 --- a/Documentation/xilinx/eemi.rst +++ /dev/null @@ -1,67 +0,0 @@ -==================================== -Xilinx Zynq MPSoC EEMI Documentation -==================================== - -Xilinx Zynq MPSoC Firmware Interface -------------------------------------- -The zynqmp-firmware node describes the interface to platform firmware. -ZynqMP has an interface to communicate with secure firmware. Firmware -driver provides an interface to firmware APIs. Interface APIs can be -used by any driver to communicate with PMC(Platform Management Controller). - -Embedded Energy Management Interface (EEMI) ----------------------------------------------- -The embedded energy management interface is used to allow software -components running across different processing clusters on a chip or -device to communicate with a power management controller (PMC) on a -device to issue or respond to power management requests. - -EEMI ops is a structure containing all eemi APIs supported by Zynq MPSoC. -The zynqmp-firmware driver maintain all EEMI APIs in zynqmp_eemi_ops -structure. Any driver who want to communicate with PMC using EEMI APIs -can call zynqmp_pm_get_eemi_ops(). - -Example of EEMI ops:: - - /* zynqmp-firmware driver maintain all EEMI APIs */ - struct zynqmp_eemi_ops { - int (*get_api_version)(u32 *version); - int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out); - }; - - static const struct zynqmp_eemi_ops eemi_ops = { - .get_api_version = zynqmp_pm_get_api_version, - .query_data = zynqmp_pm_query_data, - }; - -Example of EEMI ops usage:: - - static const struct zynqmp_eemi_ops *eemi_ops; - u32 ret_payload[PAYLOAD_ARG_CNT]; - int ret; - - eemi_ops = zynqmp_pm_get_eemi_ops(); - if (IS_ERR(eemi_ops)) - return PTR_ERR(eemi_ops); - - ret = eemi_ops->query_data(qdata, ret_payload); - -IOCTL ------- -IOCTL API is for device control and configuration. It is not a system -IOCTL but it is an EEMI API. This API can be used by master to control -any device specific configuration. IOCTL definitions can be platform -specific. This API also manage shared device configuration. - -The following IOCTL IDs are valid for device control: -- IOCTL_SET_PLL_FRAC_MODE 8 -- IOCTL_GET_PLL_FRAC_MODE 9 -- IOCTL_SET_PLL_FRAC_DATA 10 -- IOCTL_GET_PLL_FRAC_DATA 11 - -Refer EEMI API guide [0] for IOCTL specific parameters and other EEMI APIs. - -References ----------- -[0] Embedded Energy Management Interface (EEMI) API guide: - https://www.xilinx.com/support/documentation/user_guides/ug1200-eemi-api.pdf diff --git a/Documentation/xilinx/index.rst b/Documentation/xilinx/index.rst deleted file mode 100644 index 01cc1a0714df..000000000000 --- a/Documentation/xilinx/index.rst +++ /dev/null @@ -1,17 +0,0 @@ -:orphan: - -=========== -Xilinx FPGA -=========== - -.. toctree:: - :maxdepth: 1 - - eemi - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` -- cgit From c92992fc609fe99d926855eb1945f38ef4ad8e6c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Apr 2019 16:49:11 -0300 Subject: docs: driver-api: add remaining converted dirs to it There are a number of driver-specific descriptions that contain a mix of userspace and kernelspace documentation. Just like we did with other similar subsystems, add them at the driver-api groupset, but don't move the directories. Signed-off-by: Mauro Carvalho Chehab --- Documentation/driver-api/index.rst | 2 ++ Documentation/driver-api/pps.rst | 2 +- Documentation/driver-api/ptp.rst | 2 +- Documentation/index.rst | 3 +++ Documentation/mic/index.rst | 2 -- Documentation/phy/samsung-usb2.rst | 2 -- Documentation/scheduler/index.rst | 2 -- 7 files changed, 7 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 77322753c1bc..1dde9692075c 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -83,6 +83,8 @@ available subsections can be seen below. ntb nvmem parport-lowlevel + pps + ptp pti_intel_mid pwm rfkill diff --git a/Documentation/driver-api/pps.rst b/Documentation/driver-api/pps.rst index 1456d2c32ebd..2d6b99766ee8 100644 --- a/Documentation/driver-api/pps.rst +++ b/Documentation/driver-api/pps.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ====================== PPS - Pulse Per Second diff --git a/Documentation/driver-api/ptp.rst b/Documentation/driver-api/ptp.rst index b6e65d66d37a..a15192e32347 100644 --- a/Documentation/driver-api/ptp.rst +++ b/Documentation/driver-api/ptp.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 =========================================== PTP hardware clock infrastructure for Linux diff --git a/Documentation/index.rst b/Documentation/index.rst index dcdaaff71633..041ffe442960 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -110,6 +110,9 @@ needed). bpf/index usb/index misc-devices/index + mic/index + phy/samsung-usb2 + scheduler/index Architecture-specific documentation ----------------------------------- diff --git a/Documentation/mic/index.rst b/Documentation/mic/index.rst index 082fa8f6a260..3a8d06367ef1 100644 --- a/Documentation/mic/index.rst +++ b/Documentation/mic/index.rst @@ -1,5 +1,3 @@ -:orphan: - ============================================= Intel Many Integrated Core (MIC) architecture ============================================= diff --git a/Documentation/phy/samsung-usb2.rst b/Documentation/phy/samsung-usb2.rst index 98b5952fcb97..c48c8b9797b9 100644 --- a/Documentation/phy/samsung-usb2.rst +++ b/Documentation/phy/samsung-usb2.rst @@ -1,5 +1,3 @@ -:orphan: - ==================================== Samsung USB 2.0 PHY adaptation layer ==================================== diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 058be77a4c34..69074e5de9c4 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -1,5 +1,3 @@ -:orphan: - =============== Linux Scheduler =============== -- cgit From 65388dad1bbb51a4eb6cc91b9fa865b57646fb67 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 16:31:35 -0300 Subject: docs: serial: move it to the driver-api The contents of this directory is mostly driver-api stuff. Signed-off-by: Mauro Carvalho Chehab --- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/serial/cyclades_z.rst | 11 + Documentation/driver-api/serial/driver.rst | 549 ++++++++++++++++++ Documentation/driver-api/serial/index.rst | 32 ++ Documentation/driver-api/serial/moxa-smartio.rst | 615 +++++++++++++++++++++ Documentation/driver-api/serial/n_gsm.rst | 103 ++++ Documentation/driver-api/serial/rocket.rst | 185 +++++++ Documentation/driver-api/serial/serial-iso7816.rst | 90 +++ Documentation/driver-api/serial/serial-rs485.rst | 103 ++++ Documentation/driver-api/serial/tty.rst | 328 +++++++++++ Documentation/serial/cyclades_z.rst | 11 - Documentation/serial/driver.rst | 549 ------------------ Documentation/serial/index.rst | 32 -- Documentation/serial/moxa-smartio.rst | 615 --------------------- Documentation/serial/n_gsm.rst | 103 ---- Documentation/serial/rocket.rst | 185 ------- Documentation/serial/serial-iso7816.rst | 90 --- Documentation/serial/serial-rs485.rst | 103 ---- Documentation/serial/tty.rst | 328 ----------- 19 files changed, 2017 insertions(+), 2016 deletions(-) create mode 100644 Documentation/driver-api/serial/cyclades_z.rst create mode 100644 Documentation/driver-api/serial/driver.rst create mode 100644 Documentation/driver-api/serial/index.rst create mode 100644 Documentation/driver-api/serial/moxa-smartio.rst create mode 100644 Documentation/driver-api/serial/n_gsm.rst create mode 100644 Documentation/driver-api/serial/rocket.rst create mode 100644 Documentation/driver-api/serial/serial-iso7816.rst create mode 100644 Documentation/driver-api/serial/serial-rs485.rst create mode 100644 Documentation/driver-api/serial/tty.rst delete mode 100644 Documentation/serial/cyclades_z.rst delete mode 100644 Documentation/serial/driver.rst delete mode 100644 Documentation/serial/index.rst delete mode 100644 Documentation/serial/moxa-smartio.rst delete mode 100644 Documentation/serial/n_gsm.rst delete mode 100644 Documentation/serial/rocket.rst delete mode 100644 Documentation/serial/serial-iso7816.rst delete mode 100644 Documentation/serial/serial-rs485.rst delete mode 100644 Documentation/serial/tty.rst (limited to 'Documentation') diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 1dde9692075c..cf39b8f9d0f9 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -88,6 +88,7 @@ available subsections can be seen below. pti_intel_mid pwm rfkill + serial/index sgi-ioc4 sm501 smsc_ece1099 diff --git a/Documentation/driver-api/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst new file mode 100644 index 000000000000..532ff67e2f1c --- /dev/null +++ b/Documentation/driver-api/serial/cyclades_z.rst @@ -0,0 +1,11 @@ +================ +Cyclades-Z notes +================ + +The Cyclades-Z must have firmware loaded onto the card before it will +operate. This operation should be performed during system startup, + +The firmware, loader program and the latest device driver code are +available from Cyclades at + + ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/ diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst new file mode 100644 index 000000000000..31bd4e16fb1f --- /dev/null +++ b/Documentation/driver-api/serial/driver.rst @@ -0,0 +1,549 @@ +==================== +Low Level Serial API +==================== + + +This document is meant as a brief overview of some aspects of the new serial +driver. It is not complete, any questions you have should be directed to + + +The reference implementation is contained within amba-pl011.c. + + + +Low Level Serial Hardware Driver +-------------------------------- + +The low level serial hardware driver is responsible for supplying port +information (defined by uart_port) and a set of control methods (defined +by uart_ops) to the core serial driver. The low level driver is also +responsible for handling interrupts for the port, and providing any +console support. + + +Console Support +--------------- + +The serial core provides a few helper functions. This includes identifing +the correct port structure (via uart_get_console) and decoding command line +arguments (uart_parse_options). + +There is also a helper function (uart_console_write) which performs a +character by character write, translating newlines to CRLF sequences. +Driver writers are recommended to use this function rather than implementing +their own version. + + +Locking +------- + +It is the responsibility of the low level hardware driver to perform the +necessary locking using port->lock. There are some exceptions (which +are described in the uart_ops listing below.) + +There are two locks. A per-port spinlock, and an overall semaphore. + +From the core driver perspective, the port->lock locks the following +data:: + + port->mctrl + port->icount + port->state->xmit.head (circ_buf->head) + port->state->xmit.tail (circ_buf->tail) + +The low level driver is free to use this lock to provide any additional +locking. + +The port_sem semaphore is used to protect against ports being added/ +removed or reconfigured at inappropriate times. Since v2.6.27, this +semaphore has been the 'mutex' member of the tty_port struct, and +commonly referred to as the port mutex. + + +uart_ops +-------- + +The uart_ops structure is the main interface between serial_core and the +hardware specific driver. It contains all the methods to control the +hardware. + + tx_empty(port) + This function tests whether the transmitter fifo and shifter + for the port described by 'port' is empty. If it is empty, + this function should return TIOCSER_TEMT, otherwise return 0. + If the port does not support this operation, then it should + return TIOCSER_TEMT. + + Locking: none. + + Interrupts: caller dependent. + + This call must not sleep + + set_mctrl(port, mctrl) + This function sets the modem control lines for port described + by 'port' to the state described by mctrl. The relevant bits + of mctrl are: + + - TIOCM_RTS RTS signal. + - TIOCM_DTR DTR signal. + - TIOCM_OUT1 OUT1 signal. + - TIOCM_OUT2 OUT2 signal. + - TIOCM_LOOP Set the port into loopback mode. + + If the appropriate bit is set, the signal should be driven + active. If the bit is clear, the signal should be driven + inactive. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + get_mctrl(port) + Returns the current state of modem control inputs. The state + of the outputs should not be returned, since the core keeps + track of their state. The state information should include: + + - TIOCM_CAR state of DCD signal + - TIOCM_CTS state of CTS signal + - TIOCM_DSR state of DSR signal + - TIOCM_RI state of RI signal + + The bit is set if the signal is currently driven active. If + the port does not support CTS, DCD or DSR, the driver should + indicate that the signal is permanently active. If RI is + not available, the signal should not be indicated as active. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + stop_tx(port) + Stop transmitting characters. This might be due to the CTS + line becoming inactive or the tty layer indicating we want + to stop transmission due to an XOFF character. + + The driver should stop transmitting characters as soon as + possible. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + start_tx(port) + Start transmitting characters. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + throttle(port) + Notify the serial driver that input buffers for the line discipline are + close to full, and it should somehow signal that no more characters + should be sent to the serial port. + This will be called only if hardware assisted flow control is enabled. + + Locking: serialized with .unthrottle() and termios modification by the + tty layer. + + unthrottle(port) + Notify the serial driver that characters can now be sent to the serial + port without fear of overrunning the input buffers of the line + disciplines. + + This will be called only if hardware assisted flow control is enabled. + + Locking: serialized with .throttle() and termios modification by the + tty layer. + + send_xchar(port,ch) + Transmit a high priority character, even if the port is stopped. + This is used to implement XON/XOFF flow control and tcflow(). If + the serial driver does not implement this function, the tty core + will append the character to the circular buffer and then call + start_tx() / stop_tx() to flush the data out. + + Do not transmit if ch == '\0' (__DISABLED_CHAR). + + Locking: none. + + Interrupts: caller dependent. + + stop_rx(port) + Stop receiving characters; the port is in the process of + being closed. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + enable_ms(port) + Enable the modem status interrupts. + + This method may be called multiple times. Modem status + interrupts should be disabled when the shutdown method is + called. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + break_ctl(port,ctl) + Control the transmission of a break signal. If ctl is + nonzero, the break signal should be transmitted. The signal + should be terminated when another call is made with a zero + ctl. + + Locking: caller holds tty_port->mutex + + startup(port) + Grab any interrupt resources and initialise any low level driver + state. Enable the port for reception. It should not activate + RTS nor DTR; this will be done via a separate call to set_mctrl. + + This method will only be called when the port is initially opened. + + Locking: port_sem taken. + + Interrupts: globally disabled. + + shutdown(port) + Disable the port, disable any break condition that may be in + effect, and free any interrupt resources. It should not disable + RTS nor DTR; this will have already been done via a separate + call to set_mctrl. + + Drivers must not access port->state once this call has completed. + + This method will only be called when there are no more users of + this port. + + Locking: port_sem taken. + + Interrupts: caller dependent. + + flush_buffer(port) + Flush any write buffers, reset any DMA state and stop any + ongoing DMA transfers. + + This will be called whenever the port->state->xmit circular + buffer is cleared. + + Locking: port->lock taken. + + Interrupts: locally disabled. + + This call must not sleep + + set_termios(port,termios,oldtermios) + Change the port parameters, including word length, parity, stop + bits. Update read_status_mask and ignore_status_mask to indicate + the types of events we are interested in receiving. Relevant + termios->c_cflag bits are: + + CSIZE + - word size + CSTOPB + - 2 stop bits + PARENB + - parity enable + PARODD + - odd parity (when PARENB is in force) + CREAD + - enable reception of characters (if not set, + still receive characters from the port, but + throw them away. + CRTSCTS + - if set, enable CTS status change reporting + CLOCAL + - if not set, enable modem status change + reporting. + + Relevant termios->c_iflag bits are: + + INPCK + - enable frame and parity error events to be + passed to the TTY layer. + BRKINT / PARMRK + - both of these enable break events to be + passed to the TTY layer. + + IGNPAR + - ignore parity and framing errors + IGNBRK + - ignore break errors, If IGNPAR is also + set, ignore overrun errors as well. + + The interaction of the iflag bits is as follows (parity error + given as an example): + + =============== ======= ====== ============================= + Parity error INPCK IGNPAR + =============== ======= ====== ============================= + n/a 0 n/a character received, marked as + TTY_NORMAL + None 1 n/a character received, marked as + TTY_NORMAL + Yes 1 0 character received, marked as + TTY_PARITY + Yes 1 1 character discarded + =============== ======= ====== ============================= + + Other flags may be used (eg, xon/xoff characters) if your + hardware supports hardware "soft" flow control. + + Locking: caller holds tty_port->mutex + + Interrupts: caller dependent. + + This call must not sleep + + set_ldisc(port,termios) + Notifier for discipline change. See Documentation/driver-api/serial/tty.rst. + + Locking: caller holds tty_port->mutex + + pm(port,state,oldstate) + Perform any power management related activities on the specified + port. State indicates the new state (defined by + enum uart_pm_state), oldstate indicates the previous state. + + This function should not be used to grab any resources. + + This will be called when the port is initially opened and finally + closed, except when the port is also the system console. This + will occur even if CONFIG_PM is not set. + + Locking: none. + + Interrupts: caller dependent. + + type(port) + Return a pointer to a string constant describing the specified + port, or return NULL, in which case the string 'unknown' is + substituted. + + Locking: none. + + Interrupts: caller dependent. + + release_port(port) + Release any memory and IO region resources currently in use by + the port. + + Locking: none. + + Interrupts: caller dependent. + + request_port(port) + Request any memory and IO region resources required by the port. + If any fail, no resources should be registered when this function + returns, and it should return -EBUSY on failure. + + Locking: none. + + Interrupts: caller dependent. + + config_port(port,type) + Perform any autoconfiguration steps required for the port. `type` + contains a bit mask of the required configuration. UART_CONFIG_TYPE + indicates that the port requires detection and identification. + port->type should be set to the type found, or PORT_UNKNOWN if + no port was detected. + + UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal, + which should be probed using standard kernel autoprobing techniques. + This is not necessary on platforms where ports have interrupts + internally hard wired (eg, system on a chip implementations). + + Locking: none. + + Interrupts: caller dependent. + + verify_port(port,serinfo) + Verify the new serial port information contained within serinfo is + suitable for this port type. + + Locking: none. + + Interrupts: caller dependent. + + ioctl(port,cmd,arg) + Perform any port specific IOCTLs. IOCTL commands must be defined + using the standard numbering system found in + + Locking: none. + + Interrupts: caller dependent. + + poll_init(port) + Called by kgdb to perform the minimal hardware initialization needed + to support poll_put_char() and poll_get_char(). Unlike ->startup() + this should not request interrupts. + + Locking: tty_mutex and tty_port->mutex taken. + + Interrupts: n/a. + + poll_put_char(port,ch) + Called by kgdb to write a single character directly to the serial + port. It can and should block until there is space in the TX FIFO. + + Locking: none. + + Interrupts: caller dependent. + + This call must not sleep + + poll_get_char(port) + Called by kgdb to read a single character directly from the serial + port. If data is available, it should be returned; otherwise + the function should return NO_POLL_CHAR immediately. + + Locking: none. + + Interrupts: caller dependent. + + This call must not sleep + +Other functions +--------------- + +uart_update_timeout(port,cflag,baud) + Update the FIFO drain timeout, port->timeout, according to the + number of bits, parity, stop bits and baud rate. + + Locking: caller is expected to take port->lock + + Interrupts: n/a + +uart_get_baud_rate(port,termios,old,min,max) + Return the numeric baud rate for the specified termios, taking + account of the special 38400 baud "kludge". The B0 baud rate + is mapped to 9600 baud. + + If the baud rate is not within min..max, then if old is non-NULL, + the original baud rate will be tried. If that exceeds the + min..max constraint, 9600 baud will be returned. termios will + be updated to the baud rate in use. + + Note: min..max must always allow 9600 baud to be selected. + + Locking: caller dependent. + + Interrupts: n/a + +uart_get_divisor(port,baud) + Return the divisor (baud_base / baud) for the specified baud + rate, appropriately rounded. + + If 38400 baud and custom divisor is selected, return the + custom divisor instead. + + Locking: caller dependent. + + Interrupts: n/a + +uart_match_port(port1,port2) + This utility function can be used to determine whether two + uart_port structures describe the same port. + + Locking: n/a + + Interrupts: n/a + +uart_write_wakeup(port) + A driver is expected to call this function when the number of + characters in the transmit buffer have dropped below a threshold. + + Locking: port->lock should be held. + + Interrupts: n/a + +uart_register_driver(drv) + Register a uart driver with the core driver. We in turn register + with the tty layer, and initialise the core driver per-port state. + + drv->port should be NULL, and the per-port structures should be + registered using uart_add_one_port after this call has succeeded. + + Locking: none + + Interrupts: enabled + +uart_unregister_driver() + Remove all references to a driver from the core driver. The low + level driver must have removed all its ports via the + uart_remove_one_port() if it registered them with uart_add_one_port(). + + Locking: none + + Interrupts: enabled + +**uart_suspend_port()** + +**uart_resume_port()** + +**uart_add_one_port()** + +**uart_remove_one_port()** + +Other notes +----------- + +It is intended some day to drop the 'unused' entries from uart_port, and +allow low level drivers to register their own individual uart_port's with +the core. This will allow drivers to use uart_port as a pointer to a +structure containing both the uart_port entry with their own extensions, +thus:: + + struct my_port { + struct uart_port port; + int my_stuff; + }; + +Modem control lines via GPIO +---------------------------- + +Some helpers are provided in order to set/get modem control lines via GPIO. + +mctrl_gpio_init(port, idx): + This will get the {cts,rts,...}-gpios from device tree if they are + present and request them, set direction etc, and return an + allocated structure. `devm_*` functions are used, so there's no need + to call mctrl_gpio_free(). + As this sets up the irq handling make sure to not handle changes to the + gpio input lines in your driver, too. + +mctrl_gpio_free(dev, gpios): + This will free the requested gpios in mctrl_gpio_init(). + As `devm_*` functions are used, there's generally no need to call + this function. + +mctrl_gpio_to_gpiod(gpios, gidx) + This returns the gpio_desc structure associated to the modem line + index. + +mctrl_gpio_set(gpios, mctrl): + This will sets the gpios according to the mctrl state. + +mctrl_gpio_get(gpios, mctrl): + This will update mctrl with the gpios values. + +mctrl_gpio_enable_ms(gpios): + Enables irqs and handling of changes to the ms lines. + +mctrl_gpio_disable_ms(gpios): + Disables irqs and handling of changes to the ms lines. diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst new file mode 100644 index 000000000000..33ad10d05b26 --- /dev/null +++ b/Documentation/driver-api/serial/index.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Support for Serial devices +========================== + +.. toctree:: + :maxdepth: 1 + + + driver + tty + +Serial drivers +============== + +.. toctree:: + :maxdepth: 1 + + cyclades_z + moxa-smartio + n_gsm + rocket + serial-iso7816 + serial-rs485 + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/serial/moxa-smartio.rst b/Documentation/driver-api/serial/moxa-smartio.rst new file mode 100644 index 000000000000..156100f17c3f --- /dev/null +++ b/Documentation/driver-api/serial/moxa-smartio.rst @@ -0,0 +1,615 @@ +============================================================= +MOXA Smartio/Industio Family Device Driver Installation Guide +============================================================= + +.. note:: + + This file is outdated. It needs some care in order to make it + updated to Kernel 5.0 and upper + +Copyright (C) 2008, Moxa Inc. + +Date: 01/21/2008 + +.. Content + + 1. Introduction + 2. System Requirement + 3. Installation + 3.1 Hardware installation + 3.2 Driver files + 3.3 Device naming convention + 3.4 Module driver configuration + 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x. + 3.6 Custom configuration + 3.7 Verify driver installation + 4. Utilities + 5. Setserial + 6. Troubleshooting + +1. Introduction +^^^^^^^^^^^^^^^ + + The Smartio/Industio/UPCI family Linux driver supports following multiport + boards. + + - 2 ports multiport board + CP-102U, CP-102UL, CP-102UF + CP-132U-I, CP-132UL, + CP-132, CP-132I, CP132S, CP-132IS, + CI-132, CI-132I, CI-132IS, + (C102H, C102HI, C102HIS, C102P, CP-102, CP-102S) + + - 4 ports multiport board + CP-104EL, + CP-104UL, CP-104JU, + CP-134U, CP-134U-I, + C104H/PCI, C104HS/PCI, + CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL, + C104H, C104HS, + CI-104J, CI-104JS, + CI-134, CI-134I, CI-134IS, + (C114HI, CT-114I, C104P), + POS-104UL, + CB-114, + CB-134I + + - 8 ports multiport board + CP-118EL, CP-168EL, + CP-118U, CP-168U, + C168H/PCI, + C168H, C168HS, + (C168P), + CB-108 + + This driver and installation procedure have been developed upon Linux Kernel + 2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order + to maintain compatibility, this version has also been properly tested with + RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem + occurs, please contact Moxa at support@moxa.com.tw. + + In addition to device driver, useful utilities are also provided in this + version. They are: + + - msdiag + Diagnostic program for displaying installed Moxa + Smartio/Industio boards. + - msmon + Monitor program to observe data count and line status signals. + - msterm A simple terminal program which is useful in testing serial + ports. + - io-irq.exe + Configuration program to setup ISA boards. Please note that + this program can only be executed under DOS. + + All the drivers and utilities are published in form of source code under + GNU General Public License in this version. Please refer to GNU General + Public License announcement in each source code file for more detail. + + In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/. + + This version of driver can be installed as Loadable Module (Module driver) + or built-in into kernel (Static driver). You may refer to following + installation procedure for suitable one. Before you install the driver, + please refer to hardware installation procedure in the User's Manual. + + We assume the user should be familiar with following documents. + + - Serial-HOWTO + - Kernel-HOWTO + +2. System Requirement +^^^^^^^^^^^^^^^^^^^^^ + + - Hardware platform: Intel x86 machine + - Kernel version: 2.4.x or 2.6.x + - gcc version 2.72 or later + - Maximum 4 boards can be installed in combination + +3. Installation +^^^^^^^^^^^^^^^ + +3.1 Hardware installation +========================= + + There are two types of buses, ISA and PCI, for Smartio/Industio + family multiport board. + +ISA board +--------- + + You'll have to configure CAP address, I/O address, Interrupt Vector + as well as IRQ before installing this driver. Please refer to hardware + installation procedure in User's Manual before proceed any further. + Please make sure the JP1 is open after the ISA board is set properly. + +PCI/UPCI board +-------------- + + You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict + with other ISA devices. Please refer to hardware installation + procedure in User's Manual in advance. + +PCI IRQ Sharing +--------------- + + Each port within the same multiport board shares the same IRQ. Up to + 4 Moxa Smartio/Industio PCI Family multiport boards can be installed + together on one system and they can share the same IRQ. + + +3.2 Driver files +================ + + The driver file may be obtained from ftp, CD-ROM or floppy disk. The + first step, anyway, is to copy driver file "mxser.tgz" into specified + directory. e.g. /moxa. The execute commands as below:: + + # cd / + # mkdir moxa + # cd /moxa + # tar xvf /dev/fd0 + +or:: + + # cd / + # mkdir moxa + # cd /moxa + # cp /mnt/cdrom//mxser.tgz . + # tar xvfz mxser.tgz + + +3.3 Device naming convention +============================ + + You may find all the driver and utilities files in /moxa/mxser. + Following installation procedure depends on the model you'd like to + run the driver. If you prefer module driver, please refer to 3.4. + If static driver is required, please refer to 3.5. + +Dialin and callout port +----------------------- + + This driver remains traditional serial device properties. There are + two special file name for each serial port. One is dial-in port + which is named "ttyMxx". For callout port, the naming convention + is "cumxx". + +Device naming when more than 2 boards installed +----------------------------------------------- + + Naming convention for each Smartio/Industio multiport board is + pre-defined as below. + + ============ =============== ============== + Board Num. Dial-in Port Callout port + 1st board ttyM0 - ttyM7 cum0 - cum7 + 2nd board ttyM8 - ttyM15 cum8 - cum15 + 3rd board ttyM16 - ttyM23 cum16 - cum23 + 4th board ttyM24 - ttym31 cum24 - cum31 + ============ =============== ============== + +.. note:: + + Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM* + device instead. + +Board sequence +-------------- + + This driver will activate ISA boards according to the parameter set + in the driver. After all specified ISA board activated, PCI board + will be installed in the system automatically driven. + Therefore the board number is sorted by the CAP address of ISA boards. + For PCI boards, their sequence will be after ISA boards and C168H/PCI + has higher priority than C104H/PCI boards. + +3.4 Module driver configuration +=============================== + + Module driver is easiest way to install. If you prefer static driver + installation, please skip this paragraph. + + + ------------- Prepare to use the MOXA driver -------------------- + +3.4.1 Create tty device with correct major number +------------------------------------------------- + + Before using MOXA driver, your system must have the tty devices + which are created with driver's major number. We offer one shell + script "msmknod" to simplify the procedure. + This step is only needed to be executed once. But you still + need to do this procedure when: + + a. You change the driver's major number. Please refer the "3.7" + section. + b. Your total installed MOXA boards number is changed. Maybe you + add/delete one MOXA board. + c. You want to change the tty name. This needs to modify the + shell script "msmknod" + + The procedure is:: + + # cd /moxa/mxser/driver + # ./msmknod + + This shell script will require the major number for dial-in + device and callout device to create tty device. You also need + to specify the total installed MOXA board number. Default major + numbers for dial-in device and callout device are 30, 35. If + you need to change to other number, please refer section "3.7" + for more detailed procedure. + Msmknod will delete any special files occupying the same device + naming. + +3.4.2 Build the MOXA driver and utilities +----------------------------------------- + + Before using the MOXA driver and utilities, you need compile the + all the source code. This step is only need to be executed once. + But you still re-compile the source code if you modify the source + code. For example, if you change the driver's major number (see + "3.7" section), then you need to do this step again. + + Find "Makefile" in /moxa/mxser, then run + + # make clean; make install + + ..note:: + + For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1: + # make clean; make installsp1 + + For Red Hat Enterprise Linux AS4/ES4/WS4: + # make clean; make installsp2 + + The driver files "mxser.o" and utilities will be properly compiled + and copied to system directories respectively. + +------------- Load MOXA driver-------------------- + +3.4.3 Load the MOXA driver +-------------------------- + + :: + + # modprobe mxser + + will activate the module driver. You may run "lsmod" to check + if "mxser" is activated. If the MOXA board is ISA board, the + is needed. Please refer to section "3.4.5" for more + information. + +------------- Load MOXA driver on boot -------------------- + +3.4.4 Load the mxser driver +--------------------------- + + + For the above description, you may manually execute + "modprobe mxser" to activate this driver and run + "rmmod mxser" to remove it. + + However, it's better to have a boot time configuration to + eliminate manual operation. Boot time configuration can be + achieved by rc file. We offer one "rc.mxser" file to simplify + the procedure under "moxa/mxser/driver". + + But if you use ISA board, please modify the "modprobe ..." command + to add the argument (see "3.4.5" section). After modifying the + rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser" + manually to make sure the modification is ok. If any error + encountered, please try to modify again. If the modification is + completed, follow the below step. + + Run following command for setting rc files:: + + # cd /moxa/mxser/driver + # cp ./rc.mxser /etc/rc.d + # cd /etc/rc.d + + Check "rc.serial" is existed or not. If "rc.serial" doesn't exist, + create it by vi, run "chmod 755 rc.serial" to change the permission. + + Add "/etc/rc.d/rc.mxser" in last line. + + Reboot and check if moxa.o activated by "lsmod" command. + +3.4.5. specify CAP address +-------------------------- + + If you'd like to drive Smartio/Industio ISA boards in the system, + you'll have to add parameter to specify CAP address of given + board while activating "mxser.o". The format for parameters are + as follows.:: + + modprobe mxser ioaddr=0x???,0x???,0x???,0x??? + | | | | + | | | +- 4th ISA board + | | +------ 3rd ISA board + | +------------ 2nd ISA board + +-------------------1st ISA board + +3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x +================================================================ + + Note: + To use static driver, you must install the linux kernel + source package. + +3.5.1 Backup the built-in driver in the kernel +---------------------------------------------- + + :: + + # cd /usr/src/linux/drivers/char + # mv mxser.c mxser.c.old + + For Red Hat 7.x user, you need to create link: + # cd /usr/src + # ln -s linux-2.4 linux + +3.5.2 Create link +----------------- + :: + + # cd /usr/src/linux/drivers/char + # ln -s /moxa/mxser/driver/mxser.c mxser.c + +3.5.3 Add CAP address list for ISA boards. +------------------------------------------ + + For PCI boards user, please skip this step. + + In module mode, the CAP address for ISA board is given by + parameter. In static driver configuration, you'll have to + assign it within driver's source code. If you will not + install any ISA boards, you may skip to next portion. + The instructions to modify driver source code are as + below. + + a. run:: + + # cd /moxa/mxser/driver + # vi mxser.c + + b. Find the array mxserBoardCAP[] as below:: + + static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00}; + + c. Change the address within this array using vi. For + example, to driver 2 ISA boards with CAP address + 0x280 and 0x180 as 1st and 2nd board. Just to change + the source code as follows:: + + static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00}; + +3.5.4 Setup kernel configuration +-------------------------------- + + Configure the kernel:: + + # cd /usr/src/linux + # make menuconfig + + You will go into a menu-driven system. Please select [Character + devices][Non-standard serial port support], enable the [Moxa + SmartIO support] driver with "[*]" for built-in (not "[M]"), then + select [Exit] to exit this program. + +3.5.5 Rebuild kernel +-------------------- + + The following are for Linux kernel rebuilding, for your + reference only. + + For appropriate details, please refer to the Linux document: + + a. Run the following commands:: + + cd /usr/src/linux + make clean # take a few minutes + make dep # take a few minutes + make bzImage # take probably 10-20 minutes + make install # copy boot image to correct position + + f. Please make sure the boot kernel (vmlinuz) is in the + correct position. + g. If you use 'lilo' utility, you should check /etc/lilo.conf + 'image' item specified the path which is the 'vmlinuz' path, + or you will load wrong (or old) boot kernel image (vmlinuz). + After checking /etc/lilo.conf, please run "lilo". + + Note that if the result of "make bzImage" is ERROR, then you have to + go back to Linux configuration Setup. Type "make menuconfig" in + directory /usr/src/linux. + + +3.5.6 Make tty device and special file +-------------------------------------- + + :: + # cd /moxa/mxser/driver + # ./msmknod + +3.5.7 Make utility +------------------ + + :: + + # cd /moxa/mxser/utility + # make clean; make install + +3.5.8 Reboot +------------ + + + +3.6 Custom configuration +======================== + + Although this driver already provides you default configuration, you + still can change the device name and major number. The instruction to + change these parameters are shown as below. + +a. Change Device name + + If you'd like to use other device names instead of default naming + convention, all you have to do is to modify the internal code + within the shell script "msmknod". First, you have to open "msmknod" + by vi. Locate each line contains "ttyM" and "cum" and change them + to the device name you desired. "msmknod" creates the device names + you need next time executed. + +b. Change Major number + + If major number 30 and 35 had been occupied, you may have to select + 2 free major numbers for this driver. There are 3 steps to change + major numbers. + +3.6.1 Find free major numbers +----------------------------- + + In /proc/devices, you may find all the major numbers occupied + in the system. Please select 2 major numbers that are available. + e.g. 40, 45. + +3.6.2 Create special files +-------------------------- + + Run /moxa/mxser/driver/msmknod to create special files with + specified major numbers. + +3.6.3 Modify driver with new major number +----------------------------------------- + + Run vi to open /moxa/mxser/driver/mxser.c. Locate the line + contains "MXSERMAJOR". Change the content as below:: + + #define MXSERMAJOR 40 + #define MXSERCUMAJOR 45 + + 3.6.4 Run "make clean; make install" in /moxa/mxser/driver. + +3.7 Verify driver installation +============================== + + You may refer to /var/log/messages to check the latest status + log reported by this driver whenever it's activated. + +4. Utilities +^^^^^^^^^^^^ + + There are 3 utilities contained in this driver. They are msdiag, msmon and + msterm. These 3 utilities are released in form of source code. They should + be compiled into executable file and copied into /usr/bin. + + Before using these utilities, please load driver (refer 3.4 & 3.5) and + make sure you had run the "msmknod" utility. + +msdiag - Diagnostic +=================== + + This utility provides the function to display what Moxa Smartio/Industio + board found by driver in the system. + +msmon - Port Monitoring +======================= + + This utility gives the user a quick view about all the MOXA ports' + activities. One can easily learn each port's total received/transmitted + (Rx/Tx) character count since the time when the monitoring is started. + + Rx/Tx throughputs per second are also reported in interval basis (e.g. + the last 5 seconds) and in average basis (since the time the monitoring + is started). You can reset all ports' count by key. <+> <-> + (plus/minus) keys to change the displaying time interval. Press + on the port, that cursor stay, to view the port's communication + parameters, signal status, and input/output queue. + +msterm - Terminal Emulation +=========================== + + This utility provides data sending and receiving ability of all tty ports, + especially for MOXA ports. It is quite useful for testing simple + application, for example, sending AT command to a modem connected to the + port or used as a terminal for login purpose. Note that this is only a + dumb terminal emulation without handling full screen operation. + +5. Setserial +^^^^^^^^^^^^ + + Supported Setserial parameters are listed as below. + + ============== ========================================================= + uart set UART type(16450-->disable FIFO, 16550A-->enable FIFO) + close_delay set the amount of time(in 1/100 of a second) that DTR + should be kept low while being closed. + closing_wait set the amount of time(in 1/100 of a second) that the + serial port should wait for data to be drained while + being closed, before the receiver is disable. + spd_hi Use 57.6kb when the application requests 38.4kb. + spd_vhi Use 115.2kb when the application requests 38.4kb. + spd_shi Use 230.4kb when the application requests 38.4kb. + spd_warp Use 460.8kb when the application requests 38.4kb. + spd_normal Use 38.4kb when the application requests 38.4kb. + spd_cust Use the custom divisor to set the speed when the + application requests 38.4kb. + divisor This option set the custom division. + baud_base This option set the base baud rate. + ============== ========================================================= + +6. Troubleshooting +^^^^^^^^^^^^^^^^^^ + + The boot time error messages and solutions are stated as clearly as + possible. If all the possible solutions fail, please contact our technical + support team to get more help. + + + Error msg: + More than 4 Moxa Smartio/Industio family boards found. Fifth board + and after are ignored. + + Solution: + To avoid this problem, please unplug fifth and after board, because Moxa + driver supports up to 4 boards. + + Error msg: + Request_irq fail, IRQ(?) may be conflict with another device. + + Solution: + Other PCI or ISA devices occupy the assigned IRQ. If you are not sure + which device causes the situation, please check /proc/interrupts to find + free IRQ and simply change another free IRQ for Moxa board. + + Error msg: + Board #: C1xx Series(CAP=xxx) interrupt number invalid. + + Solution: + Each port within the same multiport board shares the same IRQ. Please set + one IRQ (IRQ doesn't equal to zero) for one Moxa board. + + Error msg: + No interrupt vector be set for Moxa ISA board(CAP=xxx). + + Solution: + Moxa ISA board needs an interrupt vector.Please refer to user's manual + "Hardware Installation" chapter to set interrupt vector. + + Error msg: + Couldn't install MOXA Smartio/Industio family driver! + + Solution: + Load Moxa driver fail, the major number may conflict with other devices. + Please refer to previous section 3.7 to change a free major number for + Moxa driver. + + Error msg: + Couldn't install MOXA Smartio/Industio family callout driver! + + Solution: + Load Moxa callout driver fail, the callout device major number may + conflict with other devices. Please refer to previous section 3.7 to + change a free callout device major number for Moxa driver. diff --git a/Documentation/driver-api/serial/n_gsm.rst b/Documentation/driver-api/serial/n_gsm.rst new file mode 100644 index 000000000000..f3ad9fd26408 --- /dev/null +++ b/Documentation/driver-api/serial/n_gsm.rst @@ -0,0 +1,103 @@ +============================== +GSM 0710 tty multiplexor HOWTO +============================== + +This line discipline implements the GSM 07.10 multiplexing protocol +detailed in the following 3GPP document: + + http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip + +This document give some hints on how to use this driver with GPRS and 3G +modems connected to a physical serial port. + +How to use it +------------- +1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through + its serial port. Depending on the modem used, you can pass more or less + parameters to this command, +2. switch the serial line to using the n_gsm line discipline by using + TIOCSETD ioctl, +3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl, + +Major parts of the initialization program : +(a good starting point is util-linux-ng/sys-utils/ldattach.c):: + + #include + #define N_GSM0710 21 /* GSM 0710 Mux */ + #define DEFAULT_SPEED B115200 + #define SERIAL_PORT /dev/ttyS0 + + int ldisc = N_GSM0710; + struct gsm_config c; + struct termios configuration; + + /* open the serial port connected to the modem */ + fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY); + + /* configure the serial port : speed, flow control ... */ + + /* send the AT commands to switch the modem to CMUX mode + and check that it's successful (should return OK) */ + write(fd, "AT+CMUX=0\r", 10); + + /* experience showed that some modems need some time before + being able to answer to the first MUX packet so a delay + may be needed here in some case */ + sleep(3); + + /* use n_gsm line discipline */ + ioctl(fd, TIOCSETD, &ldisc); + + /* get n_gsm configuration */ + ioctl(fd, GSMIOC_GETCONF, &c); + /* we are initiator and need encoding 0 (basic) */ + c.initiator = 1; + c.encapsulation = 0; + /* our modem defaults to a maximum size of 127 bytes */ + c.mru = 127; + c.mtu = 127; + /* set the new configuration */ + ioctl(fd, GSMIOC_SETCONF, &c); + + /* and wait for ever to keep the line discipline enabled */ + daemon(0,0); + pause(); + +4. create the devices corresponding to the "virtual" serial ports (take care, + each modem has its configuration and some DLC have dedicated functions, + for example GPS), starting with minor 1 (DLC0 is reserved for the management + of the mux):: + + MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}` + for i in `seq 1 4`; do + mknod /dev/ttygsm$i c $MAJOR $i + done + +5. use these devices as plain serial ports. + + for example, it's possible: + + - and to use gnokii to send / receive SMS on ttygsm1 + - to use ppp to establish a datalink on ttygsm2 + +6. first close all virtual ports before closing the physical port. + + Note that after closing the physical port the modem is still in multiplexing + mode. This may prevent a successful re-opening of the port later. To avoid + this situation either reset the modem if your hardware allows that or send + a disconnect command frame manually before initializing the multiplexing mode + for the second time. The byte sequence for the disconnect command frame is:: + + 0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9. + +Additional Documentation +------------------------ +More practical details on the protocol and how it's supported by industrial +modems can be found in the following documents : + +- http://www.telit.com/module/infopool/download.php?id=616 +- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf +- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx +- http://wm.sim.com/sim/News/photo/2010721161442.pdf + +11-03-08 - Eric Bénard - diff --git a/Documentation/driver-api/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst new file mode 100644 index 000000000000..23761eae4282 --- /dev/null +++ b/Documentation/driver-api/serial/rocket.rst @@ -0,0 +1,185 @@ +================================================ +Comtrol(tm) RocketPort(R)/RocketModem(TM) Series +================================================ + +Device Driver for the Linux Operating System +============================================ + +Product overview +---------------- + +This driver provides a loadable kernel driver for the Comtrol RocketPort +and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32 +high-speed serial ports or modems. This driver supports up to a combination +of four RocketPort or RocketModems boards in one machine simultaneously. +This file assumes that you are using the RocketPort driver which is +integrated into the kernel sources. + +The driver can also be installed as an external module using the usual +"make;make install" routine. This external module driver, obtainable +from the Comtrol website listed below, is useful for updating the driver +or installing it into kernels which do not have the driver configured +into them. Installations instructions for the external module +are in the included README and HW_INSTALL files. + +RocketPort ISA and RocketModem II PCI boards currently are only supported by +this driver in module form. + +The RocketPort ISA board requires I/O ports to be configured by the DIP +switches on the board. See the section "ISA Rocketport Boards" below for +information on how to set the DIP switches. + +You pass the I/O port to the driver using the following module parameters: + +board1: + I/O port for the first ISA board +board2: + I/O port for the second ISA board +board3: + I/O port for the third ISA board +board4: + I/O port for the fourth ISA board + +There is a set of utilities and scripts provided with the external driver +(downloadable from http://www.comtrol.com) that ease the configuration and +setup of the ISA cards. + +The RocketModem II PCI boards require firmware to be loaded into the card +before it will function. The driver has only been tested as a module for this +board. + +Installation Procedures +----------------------- + +RocketPort/RocketModem PCI cards require no driver configuration, they are +automatically detected and configured. + +The RocketPort driver can be installed as a module (recommended) or built +into the kernel. This is selected, as for other drivers, through the `make config` +command from the root of the Linux source tree during the kernel build process. + +The RocketPort/RocketModem serial ports installed by this driver are assigned +device major number 46, and will be named /dev/ttyRx, where x is the port number +starting at zero (ex. /dev/ttyR0, /devttyR1, ...). If you have multiple cards +installed in the system, the mapping of port names to serial ports is displayed +in the system log at /var/log/messages. + +If installed as a module, the module must be loaded. This can be done +manually by entering "modprobe rocket". To have the module loaded automatically +upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line +"alias char-major-46 rocket". + +In order to use the ports, their device names (nodes) must be created with mknod. +This is only required once, the system will retain the names once created. To +create the RocketPort/RocketModem device names, use the command +"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero. + +For example:: + + > mknod /dev/ttyR0 c 46 0 + > mknod /dev/ttyR1 c 46 1 + > mknod /dev/ttyR2 c 46 2 + +The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes) +for you:: + + >/dev/MAKEDEV ttyR + +ISA Rocketport Boards +--------------------- + +You must assign and configure the I/O addresses used by the ISA Rocketport +card before installing and using it. This is done by setting a set of DIP +switches on the Rocketport board. + + +Setting the I/O address +----------------------- + +Before installing RocketPort(R) or RocketPort RA boards, you must find +a range of I/O addresses for it to use. The first RocketPort card +requires a 68-byte contiguous block of I/O addresses, starting at one +of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h, +0x300h, 0x340h, 0x380h. This I/O address must be reflected in the DIP +switches of *all* of the Rocketport cards. + +The second, third, and fourth RocketPort cards require a 64-byte +contiguous block of I/O addresses, starting at one of the following +I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h, +0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h. The I/O address used by the +second, third, and fourth Rocketport cards (if present) are set via +software control. The DIP switch settings for the I/O address must be +set to the value of the first Rocketport cards. + +In order to distinguish each of the card from the others, each card +must have a unique board ID set on the dip switches. The first +Rocketport board must be set with the DIP switches corresponding to +the first board, the second board must be set with the DIP switches +corresponding to the second board, etc. IMPORTANT: The board ID is +the only place where the DIP switch settings should differ between the +various Rocketport boards in a system. + +The I/O address range used by any of the RocketPort cards must not +conflict with any other cards in the system, including other +RocketPort cards. Below, you will find a list of commonly used I/O +address ranges which may be in use by other devices in your system. +On a Linux system, "cat /proc/ioports" will also be helpful in +identifying what I/O addresses are being used by devices on your +system. + +Remember, the FIRST RocketPort uses 68 I/O addresses. So, if you set it +for 0x100, it will occupy 0x100 to 0x143. This would mean that you +CAN NOT set the second, third or fourth board for address 0x140 since +the first 4 bytes of that range are used by the first board. You would +need to set the second, third, or fourth board to one of the next available +blocks such as 0x180. + +RocketPort and RocketPort RA SW1 Settings:: + + +-------------------------------+ + | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | + +-------+-------+---------------+ + | Unused| Card | I/O Port Block| + +-------------------------------+ + + DIP Switches DIP Switches + 7 8 6 5 + =================== =================== + On On UNUSED, MUST BE ON. On On First Card <==== Default + On Off Second Card + Off On Third Card + Off Off Fourth Card + + DIP Switches I/O Address Range + 4 3 2 1 Used by the First Card + ===================================== + On Off On Off 100-143 + On Off Off On 140-183 + On Off Off Off 180-1C3 <==== Default + Off On On Off 200-243 + Off On Off On 240-283 + Off On Off Off 280-2C3 + Off Off On Off 300-343 + Off Off Off On 340-383 + Off Off Off Off 380-3C3 + +Reporting Bugs +-------------- + +For technical support, please provide the following +information: Driver version, kernel release, distribution of +kernel, and type of board you are using. Error messages and log +printouts port configuration details are especially helpful. + +USA: + :Phone: (612) 494-4100 + :FAX: (612) 494-4199 + :email: support@comtrol.com + +Comtrol Europe: + :Phone: +44 (0) 1 869 323-220 + :FAX: +44 (0) 1 869 323-211 + :email: support@comtrol.co.uk + +Web: http://www.comtrol.com +FTP: ftp.comtrol.com diff --git a/Documentation/driver-api/serial/serial-iso7816.rst b/Documentation/driver-api/serial/serial-iso7816.rst new file mode 100644 index 000000000000..d990143de0c6 --- /dev/null +++ b/Documentation/driver-api/serial/serial-iso7816.rst @@ -0,0 +1,90 @@ +============================= +ISO7816 Serial Communications +============================= + +1. Introduction +=============== + + ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC) + also known as smart cards. + +2. Hardware-related considerations +================================== + + Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of + handling communication with a smart card. + + For these microcontrollers, the Linux driver should be made capable of + working in both modes, and proper ioctls (see later) should be made + available at user-level to allow switching from one mode to the other, and + vice versa. + +3. Data Structures Already Available in the Kernel +================================================== + + The Linux kernel provides the serial_iso7816 structure (see [1]) to handle + ISO7816 communications. This data structure is used to set and configure + ISO7816 parameters in ioctls. + + Any driver for devices capable of working both as RS232 and ISO7816 should + implement the iso7816_config callback in the uart_port structure. The + serial_core calls iso7816_config to do the device specific part in response + to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config + callback receives a pointer to struct serial_iso7816. + +4. Usage from user-level +======================== + + From user-level, ISO7816 configuration can be get/set using the previous + ioctls. For instance, to set ISO7816 you can use the following code:: + + #include + + /* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */ + #include + + /* Open your specific device (e.g., /dev/mydevice): */ + int fd = open ("/dev/mydevice", O_RDWR); + if (fd < 0) { + /* Error handling. See errno. */ + } + + struct serial_iso7816 iso7816conf; + + /* Reserved fields as to be zeroed */ + memset(&iso7816conf, 0, sizeof(iso7816conf)); + + /* Enable ISO7816 mode: */ + iso7816conf.flags |= SER_ISO7816_ENABLED; + + /* Select the protocol: */ + /* T=0 */ + iso7816conf.flags |= SER_ISO7816_T(0); + /* or T=1 */ + iso7816conf.flags |= SER_ISO7816_T(1); + + /* Set the guard time: */ + iso7816conf.tg = 2; + + /* Set the clock frequency*/ + iso7816conf.clk = 3571200; + + /* Set transmission factors: */ + iso7816conf.sc_fi = 372; + iso7816conf.sc_di = 1; + + if (ioctl(fd_usart, TIOCSISO7816, &iso7816conf) < 0) { + /* Error handling. See errno. */ + } + + /* Use read() and write() syscalls here... */ + + /* Close the device when finished: */ + if (close (fd) < 0) { + /* Error handling. See errno. */ + } + +5. References +============= + + [1] include/uapi/linux/serial.h diff --git a/Documentation/driver-api/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst new file mode 100644 index 000000000000..6bc824f948f9 --- /dev/null +++ b/Documentation/driver-api/serial/serial-rs485.rst @@ -0,0 +1,103 @@ +=========================== +RS485 Serial Communications +=========================== + +1. Introduction +=============== + + EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the + electrical characteristics of drivers and receivers for use in balanced + digital multipoint systems. + This standard is widely used for communications in industrial automation + because it can be used effectively over long distances and in electrically + noisy environments. + +2. Hardware-related Considerations +================================== + + Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in + half-duplex mode capable of automatically controlling line direction by + toggling RTS or DTR signals. That can be used to control external + half-duplex hardware like an RS485 transceiver or any RS232-connected + half-duplex devices like some modems. + + For these microcontrollers, the Linux driver should be made capable of + working in both modes, and proper ioctls (see later) should be made + available at user-level to allow switching from one mode to the other, and + vice versa. + +3. Data Structures Already Available in the Kernel +================================================== + + The Linux kernel provides the serial_rs485 structure (see [1]) to handle + RS485 communications. This data structure is used to set and configure RS485 + parameters in the platform data and in ioctls. + + The device tree can also provide RS485 boot time parameters (see [2] + for bindings). The driver is in charge of filling this data structure from + the values given by the device tree. + + Any driver for devices capable of working both as RS232 and RS485 should + implement the rs485_config callback in the uart_port structure. The + serial_core calls rs485_config to do the device specific part in response + to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback + receives a pointer to struct serial_rs485. + +4. Usage from user-level +======================== + + From user-level, RS485 configuration can be get/set using the previous + ioctls. For instance, to set RS485 you can use the following code:: + + #include + + /* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */ + #include + + /* Open your specific device (e.g., /dev/mydevice): */ + int fd = open ("/dev/mydevice", O_RDWR); + if (fd < 0) { + /* Error handling. See errno. */ + } + + struct serial_rs485 rs485conf; + + /* Enable RS485 mode: */ + rs485conf.flags |= SER_RS485_ENABLED; + + /* Set logical level for RTS pin equal to 1 when sending: */ + rs485conf.flags |= SER_RS485_RTS_ON_SEND; + /* or, set logical level for RTS pin equal to 0 when sending: */ + rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND); + + /* Set logical level for RTS pin equal to 1 after sending: */ + rs485conf.flags |= SER_RS485_RTS_AFTER_SEND; + /* or, set logical level for RTS pin equal to 0 after sending: */ + rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND); + + /* Set rts delay before send, if needed: */ + rs485conf.delay_rts_before_send = ...; + + /* Set rts delay after send, if needed: */ + rs485conf.delay_rts_after_send = ...; + + /* Set this flag if you want to receive data even while sending data */ + rs485conf.flags |= SER_RS485_RX_DURING_TX; + + if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) { + /* Error handling. See errno. */ + } + + /* Use read() and write() syscalls here... */ + + /* Close the device when finished: */ + if (close (fd) < 0) { + /* Error handling. See errno. */ + } + +5. References +============= + + [1] include/uapi/linux/serial.h + + [2] Documentation/devicetree/bindings/serial/rs485.txt diff --git a/Documentation/driver-api/serial/tty.rst b/Documentation/driver-api/serial/tty.rst new file mode 100644 index 000000000000..dd972caacf3e --- /dev/null +++ b/Documentation/driver-api/serial/tty.rst @@ -0,0 +1,328 @@ +================= +The Lockronomicon +================= + +Your guide to the ancient and twisted locking policies of the tty layer and +the warped logic behind them. Beware all ye who read on. + + +Line Discipline +--------------- + +Line disciplines are registered with tty_register_ldisc() passing the +discipline number and the ldisc structure. At the point of registration the +discipline must be ready to use and it is possible it will get used before +the call returns success. If the call returns an error then it won't get +called. Do not re-use ldisc numbers as they are part of the userspace ABI +and writing over an existing ldisc will cause demons to eat your computer. +After the return the ldisc data has been copied so you may free your own +copy of the structure. You must not re-register over the top of the line +discipline even with the same data or your computer again will be eaten by +demons. + +In order to remove a line discipline call tty_unregister_ldisc(). +In ancient times this always worked. In modern times the function will +return -EBUSY if the ldisc is currently in use. Since the ldisc referencing +code manages the module counts this should not usually be a concern. + +Heed this warning: the reference count field of the registered copies of the +tty_ldisc structure in the ldisc table counts the number of lines using this +discipline. The reference count of the tty_ldisc structure within a tty +counts the number of active users of the ldisc at this instant. In effect it +counts the number of threads of execution within an ldisc method (plus those +about to enter and exit although this detail matters not). + +Line Discipline Methods +----------------------- + +TTY side interfaces +^^^^^^^^^^^^^^^^^^^ + +======================= ======================================================= +open() Called when the line discipline is attached to + the terminal. No other call into the line + discipline for this tty will occur until it + completes successfully. Should initialize any + state needed by the ldisc, and set receive_room + in the tty_struct to the maximum amount of data + the line discipline is willing to accept from the + driver with a single call to receive_buf(). + Returning an error will prevent the ldisc from + being attached. Can sleep. + +close() This is called on a terminal when the line + discipline is being unplugged. At the point of + execution no further users will enter the + ldisc code for this tty. Can sleep. + +hangup() Called when the tty line is hung up. + The line discipline should cease I/O to the tty. + No further calls into the ldisc code will occur. + The return value is ignored. Can sleep. + +read() (optional) A process requests reading data from + the line. Multiple read calls may occur in parallel + and the ldisc must deal with serialization issues. + If not defined, the process will receive an EIO + error. May sleep. + +write() (optional) A process requests writing data to the + line. Multiple write calls are serialized by the + tty layer for the ldisc. If not defined, the + process will receive an EIO error. May sleep. + +flush_buffer() (optional) May be called at any point between + open and close, and instructs the line discipline + to empty its input buffer. + +set_termios() (optional) Called on termios structure changes. + The caller passes the old termios data and the + current data is in the tty. Called under the + termios semaphore so allowed to sleep. Serialized + against itself only. + +poll() (optional) Check the status for the poll/select + calls. Multiple poll calls may occur in parallel. + May sleep. + +ioctl() (optional) Called when an ioctl is handed to the + tty layer that might be for the ldisc. Multiple + ioctl calls may occur in parallel. May sleep. + +compat_ioctl() (optional) Called when a 32 bit ioctl is handed + to the tty layer that might be for the ldisc. + Multiple ioctl calls may occur in parallel. + May sleep. +======================= ======================================================= + +Driver Side Interfaces +^^^^^^^^^^^^^^^^^^^^^^ + +======================= ======================================================= +receive_buf() (optional) Called by the low-level driver to hand + a buffer of received bytes to the ldisc for + processing. The number of bytes is guaranteed not + to exceed the current value of tty->receive_room. + All bytes must be processed. + +receive_buf2() (optional) Called by the low-level driver to hand + a buffer of received bytes to the ldisc for + processing. Returns the number of bytes processed. + + If both receive_buf() and receive_buf2() are + defined, receive_buf2() should be preferred. + +write_wakeup() May be called at any point between open and close. + The TTY_DO_WRITE_WAKEUP flag indicates if a call + is needed but always races versus calls. Thus the + ldisc must be careful about setting order and to + handle unexpected calls. Must not sleep. + + The driver is forbidden from calling this directly + from the ->write call from the ldisc as the ldisc + is permitted to call the driver write method from + this function. In such a situation defer it. + +dcd_change() Report to the tty line the current DCD pin status + changes and the relative timestamp. The timestamp + cannot be NULL. +======================= ======================================================= + + +Driver Access +^^^^^^^^^^^^^ + +Line discipline methods can call the following methods of the underlying +hardware driver through the function pointers within the tty->driver +structure: + +======================= ======================================================= +write() Write a block of characters to the tty device. + Returns the number of characters accepted. The + character buffer passed to this method is already + in kernel space. + +put_char() Queues a character for writing to the tty device. + If there is no room in the queue, the character is + ignored. + +flush_chars() (Optional) If defined, must be called after + queueing characters with put_char() in order to + start transmission. + +write_room() Returns the numbers of characters the tty driver + will accept for queueing to be written. + +ioctl() Invoke device specific ioctl. + Expects data pointers to refer to userspace. + Returns ENOIOCTLCMD for unrecognized ioctl numbers. + +set_termios() Notify the tty driver that the device's termios + settings have changed. New settings are in + tty->termios. Previous settings should be passed in + the "old" argument. + + The API is defined such that the driver should return + the actual modes selected. This means that the + driver function is responsible for modifying any + bits in the request it cannot fulfill to indicate + the actual modes being used. A device with no + hardware capability for change (e.g. a USB dongle or + virtual port) can provide NULL for this method. + +throttle() Notify the tty driver that input buffers for the + line discipline are close to full, and it should + somehow signal that no more characters should be + sent to the tty. + +unthrottle() Notify the tty driver that characters can now be + sent to the tty without fear of overrunning the + input buffers of the line disciplines. + +stop() Ask the tty driver to stop outputting characters + to the tty device. + +start() Ask the tty driver to resume sending characters + to the tty device. + +hangup() Ask the tty driver to hang up the tty device. + +break_ctl() (Optional) Ask the tty driver to turn on or off + BREAK status on the RS-232 port. If state is -1, + then the BREAK status should be turned on; if + state is 0, then BREAK should be turned off. + If this routine is not implemented, use ioctls + TIOCSBRK / TIOCCBRK instead. + +wait_until_sent() Waits until the device has written out all of the + characters in its transmitter FIFO. + +send_xchar() Send a high-priority XON/XOFF character to the device. +======================= ======================================================= + + +Flags +^^^^^ + +Line discipline methods have access to tty->flags field containing the +following interesting flags: + +======================= ======================================================= +TTY_THROTTLED Driver input is throttled. The ldisc should call + tty->driver->unthrottle() in order to resume + reception when it is ready to process more data. + +TTY_DO_WRITE_WAKEUP If set, causes the driver to call the ldisc's + write_wakeup() method in order to resume + transmission when it can accept more data + to transmit. + +TTY_IO_ERROR If set, causes all subsequent userspace read/write + calls on the tty to fail, returning -EIO. + +TTY_OTHER_CLOSED Device is a pty and the other side has closed. + +TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into + smaller chunks. +======================= ======================================================= + + +Locking +^^^^^^^ + +Callers to the line discipline functions from the tty layer are required to +take line discipline locks. The same is true of calls from the driver side +but not yet enforced. + +Three calls are now provided:: + + ldisc = tty_ldisc_ref(tty); + +takes a handle to the line discipline in the tty and returns it. If no ldisc +is currently attached or the ldisc is being closed and re-opened at this +point then NULL is returned. While this handle is held the ldisc will not +change or go away:: + + tty_ldisc_deref(ldisc) + +Returns the ldisc reference and allows the ldisc to be closed. Returning the +reference takes away your right to call the ldisc functions until you take +a new reference:: + + ldisc = tty_ldisc_ref_wait(tty); + +Performs the same function as tty_ldisc_ref except that it will wait for an +ldisc change to complete and then return a reference to the new ldisc. + +While these functions are slightly slower than the old code they should have +minimal impact as most receive logic uses the flip buffers and they only +need to take a reference when they push bits up through the driver. + +A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc +functions are called with the ldisc unavailable. Thus tty_ldisc_ref will +fail in this situation if used within these functions. Ldisc and driver +code calling its own functions must be careful in this case. + + +Driver Interface +---------------- + +======================= ======================================================= +open() Called when a device is opened. May sleep + +close() Called when a device is closed. At the point of + return from this call the driver must make no + further ldisc calls of any kind. May sleep + +write() Called to write bytes to the device. May not + sleep. May occur in parallel in special cases. + Because this includes panic paths drivers generally + shouldn't try and do clever locking here. + +put_char() Stuff a single character onto the queue. The + driver is guaranteed following up calls to + flush_chars. + +flush_chars() Ask the kernel to write put_char queue + +write_room() Return the number of characters that can be stuffed + into the port buffers without overflow (or less). + The ldisc is responsible for being intelligent + about multi-threading of write_room/write calls + +ioctl() Called when an ioctl may be for the driver + +set_termios() Called on termios change, serialized against + itself by a semaphore. May sleep. + +set_ldisc() Notifier for discipline change. At the point this + is done the discipline is not yet usable. Can now + sleep (I think) + +throttle() Called by the ldisc to ask the driver to do flow + control. Serialization including with unthrottle + is the job of the ldisc layer. + +unthrottle() Called by the ldisc to ask the driver to stop flow + control. + +stop() Ldisc notifier to the driver to stop output. As with + throttle the serializations with start() are down + to the ldisc layer. + +start() Ldisc notifier to the driver to start output. + +hangup() Ask the tty driver to cause a hangup initiated + from the host side. [Can sleep ??] + +break_ctl() Send RS232 break. Can sleep. Can get called in + parallel, driver must serialize (for now), and + with write calls. + +wait_until_sent() Wait for characters to exit the hardware queue + of the driver. Can sleep + +send_xchar() Send XON/XOFF and if possible jump the queue with + it in order to get fast flow control responses. + Cannot sleep ?? +======================= ======================================================= diff --git a/Documentation/serial/cyclades_z.rst b/Documentation/serial/cyclades_z.rst deleted file mode 100644 index 532ff67e2f1c..000000000000 --- a/Documentation/serial/cyclades_z.rst +++ /dev/null @@ -1,11 +0,0 @@ -================ -Cyclades-Z notes -================ - -The Cyclades-Z must have firmware loaded onto the card before it will -operate. This operation should be performed during system startup, - -The firmware, loader program and the latest device driver code are -available from Cyclades at - - ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/ diff --git a/Documentation/serial/driver.rst b/Documentation/serial/driver.rst deleted file mode 100644 index 4537119bf624..000000000000 --- a/Documentation/serial/driver.rst +++ /dev/null @@ -1,549 +0,0 @@ -==================== -Low Level Serial API -==================== - - -This document is meant as a brief overview of some aspects of the new serial -driver. It is not complete, any questions you have should be directed to - - -The reference implementation is contained within amba-pl011.c. - - - -Low Level Serial Hardware Driver --------------------------------- - -The low level serial hardware driver is responsible for supplying port -information (defined by uart_port) and a set of control methods (defined -by uart_ops) to the core serial driver. The low level driver is also -responsible for handling interrupts for the port, and providing any -console support. - - -Console Support ---------------- - -The serial core provides a few helper functions. This includes identifing -the correct port structure (via uart_get_console) and decoding command line -arguments (uart_parse_options). - -There is also a helper function (uart_console_write) which performs a -character by character write, translating newlines to CRLF sequences. -Driver writers are recommended to use this function rather than implementing -their own version. - - -Locking -------- - -It is the responsibility of the low level hardware driver to perform the -necessary locking using port->lock. There are some exceptions (which -are described in the uart_ops listing below.) - -There are two locks. A per-port spinlock, and an overall semaphore. - -From the core driver perspective, the port->lock locks the following -data:: - - port->mctrl - port->icount - port->state->xmit.head (circ_buf->head) - port->state->xmit.tail (circ_buf->tail) - -The low level driver is free to use this lock to provide any additional -locking. - -The port_sem semaphore is used to protect against ports being added/ -removed or reconfigured at inappropriate times. Since v2.6.27, this -semaphore has been the 'mutex' member of the tty_port struct, and -commonly referred to as the port mutex. - - -uart_ops --------- - -The uart_ops structure is the main interface between serial_core and the -hardware specific driver. It contains all the methods to control the -hardware. - - tx_empty(port) - This function tests whether the transmitter fifo and shifter - for the port described by 'port' is empty. If it is empty, - this function should return TIOCSER_TEMT, otherwise return 0. - If the port does not support this operation, then it should - return TIOCSER_TEMT. - - Locking: none. - - Interrupts: caller dependent. - - This call must not sleep - - set_mctrl(port, mctrl) - This function sets the modem control lines for port described - by 'port' to the state described by mctrl. The relevant bits - of mctrl are: - - - TIOCM_RTS RTS signal. - - TIOCM_DTR DTR signal. - - TIOCM_OUT1 OUT1 signal. - - TIOCM_OUT2 OUT2 signal. - - TIOCM_LOOP Set the port into loopback mode. - - If the appropriate bit is set, the signal should be driven - active. If the bit is clear, the signal should be driven - inactive. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - get_mctrl(port) - Returns the current state of modem control inputs. The state - of the outputs should not be returned, since the core keeps - track of their state. The state information should include: - - - TIOCM_CAR state of DCD signal - - TIOCM_CTS state of CTS signal - - TIOCM_DSR state of DSR signal - - TIOCM_RI state of RI signal - - The bit is set if the signal is currently driven active. If - the port does not support CTS, DCD or DSR, the driver should - indicate that the signal is permanently active. If RI is - not available, the signal should not be indicated as active. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - stop_tx(port) - Stop transmitting characters. This might be due to the CTS - line becoming inactive or the tty layer indicating we want - to stop transmission due to an XOFF character. - - The driver should stop transmitting characters as soon as - possible. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - start_tx(port) - Start transmitting characters. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - throttle(port) - Notify the serial driver that input buffers for the line discipline are - close to full, and it should somehow signal that no more characters - should be sent to the serial port. - This will be called only if hardware assisted flow control is enabled. - - Locking: serialized with .unthrottle() and termios modification by the - tty layer. - - unthrottle(port) - Notify the serial driver that characters can now be sent to the serial - port without fear of overrunning the input buffers of the line - disciplines. - - This will be called only if hardware assisted flow control is enabled. - - Locking: serialized with .throttle() and termios modification by the - tty layer. - - send_xchar(port,ch) - Transmit a high priority character, even if the port is stopped. - This is used to implement XON/XOFF flow control and tcflow(). If - the serial driver does not implement this function, the tty core - will append the character to the circular buffer and then call - start_tx() / stop_tx() to flush the data out. - - Do not transmit if ch == '\0' (__DISABLED_CHAR). - - Locking: none. - - Interrupts: caller dependent. - - stop_rx(port) - Stop receiving characters; the port is in the process of - being closed. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - enable_ms(port) - Enable the modem status interrupts. - - This method may be called multiple times. Modem status - interrupts should be disabled when the shutdown method is - called. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - break_ctl(port,ctl) - Control the transmission of a break signal. If ctl is - nonzero, the break signal should be transmitted. The signal - should be terminated when another call is made with a zero - ctl. - - Locking: caller holds tty_port->mutex - - startup(port) - Grab any interrupt resources and initialise any low level driver - state. Enable the port for reception. It should not activate - RTS nor DTR; this will be done via a separate call to set_mctrl. - - This method will only be called when the port is initially opened. - - Locking: port_sem taken. - - Interrupts: globally disabled. - - shutdown(port) - Disable the port, disable any break condition that may be in - effect, and free any interrupt resources. It should not disable - RTS nor DTR; this will have already been done via a separate - call to set_mctrl. - - Drivers must not access port->state once this call has completed. - - This method will only be called when there are no more users of - this port. - - Locking: port_sem taken. - - Interrupts: caller dependent. - - flush_buffer(port) - Flush any write buffers, reset any DMA state and stop any - ongoing DMA transfers. - - This will be called whenever the port->state->xmit circular - buffer is cleared. - - Locking: port->lock taken. - - Interrupts: locally disabled. - - This call must not sleep - - set_termios(port,termios,oldtermios) - Change the port parameters, including word length, parity, stop - bits. Update read_status_mask and ignore_status_mask to indicate - the types of events we are interested in receiving. Relevant - termios->c_cflag bits are: - - CSIZE - - word size - CSTOPB - - 2 stop bits - PARENB - - parity enable - PARODD - - odd parity (when PARENB is in force) - CREAD - - enable reception of characters (if not set, - still receive characters from the port, but - throw them away. - CRTSCTS - - if set, enable CTS status change reporting - CLOCAL - - if not set, enable modem status change - reporting. - - Relevant termios->c_iflag bits are: - - INPCK - - enable frame and parity error events to be - passed to the TTY layer. - BRKINT / PARMRK - - both of these enable break events to be - passed to the TTY layer. - - IGNPAR - - ignore parity and framing errors - IGNBRK - - ignore break errors, If IGNPAR is also - set, ignore overrun errors as well. - - The interaction of the iflag bits is as follows (parity error - given as an example): - - =============== ======= ====== ============================= - Parity error INPCK IGNPAR - =============== ======= ====== ============================= - n/a 0 n/a character received, marked as - TTY_NORMAL - None 1 n/a character received, marked as - TTY_NORMAL - Yes 1 0 character received, marked as - TTY_PARITY - Yes 1 1 character discarded - =============== ======= ====== ============================= - - Other flags may be used (eg, xon/xoff characters) if your - hardware supports hardware "soft" flow control. - - Locking: caller holds tty_port->mutex - - Interrupts: caller dependent. - - This call must not sleep - - set_ldisc(port,termios) - Notifier for discipline change. See Documentation/serial/tty.rst. - - Locking: caller holds tty_port->mutex - - pm(port,state,oldstate) - Perform any power management related activities on the specified - port. State indicates the new state (defined by - enum uart_pm_state), oldstate indicates the previous state. - - This function should not be used to grab any resources. - - This will be called when the port is initially opened and finally - closed, except when the port is also the system console. This - will occur even if CONFIG_PM is not set. - - Locking: none. - - Interrupts: caller dependent. - - type(port) - Return a pointer to a string constant describing the specified - port, or return NULL, in which case the string 'unknown' is - substituted. - - Locking: none. - - Interrupts: caller dependent. - - release_port(port) - Release any memory and IO region resources currently in use by - the port. - - Locking: none. - - Interrupts: caller dependent. - - request_port(port) - Request any memory and IO region resources required by the port. - If any fail, no resources should be registered when this function - returns, and it should return -EBUSY on failure. - - Locking: none. - - Interrupts: caller dependent. - - config_port(port,type) - Perform any autoconfiguration steps required for the port. `type` - contains a bit mask of the required configuration. UART_CONFIG_TYPE - indicates that the port requires detection and identification. - port->type should be set to the type found, or PORT_UNKNOWN if - no port was detected. - - UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal, - which should be probed using standard kernel autoprobing techniques. - This is not necessary on platforms where ports have interrupts - internally hard wired (eg, system on a chip implementations). - - Locking: none. - - Interrupts: caller dependent. - - verify_port(port,serinfo) - Verify the new serial port information contained within serinfo is - suitable for this port type. - - Locking: none. - - Interrupts: caller dependent. - - ioctl(port,cmd,arg) - Perform any port specific IOCTLs. IOCTL commands must be defined - using the standard numbering system found in - - Locking: none. - - Interrupts: caller dependent. - - poll_init(port) - Called by kgdb to perform the minimal hardware initialization needed - to support poll_put_char() and poll_get_char(). Unlike ->startup() - this should not request interrupts. - - Locking: tty_mutex and tty_port->mutex taken. - - Interrupts: n/a. - - poll_put_char(port,ch) - Called by kgdb to write a single character directly to the serial - port. It can and should block until there is space in the TX FIFO. - - Locking: none. - - Interrupts: caller dependent. - - This call must not sleep - - poll_get_char(port) - Called by kgdb to read a single character directly from the serial - port. If data is available, it should be returned; otherwise - the function should return NO_POLL_CHAR immediately. - - Locking: none. - - Interrupts: caller dependent. - - This call must not sleep - -Other functions ---------------- - -uart_update_timeout(port,cflag,baud) - Update the FIFO drain timeout, port->timeout, according to the - number of bits, parity, stop bits and baud rate. - - Locking: caller is expected to take port->lock - - Interrupts: n/a - -uart_get_baud_rate(port,termios,old,min,max) - Return the numeric baud rate for the specified termios, taking - account of the special 38400 baud "kludge". The B0 baud rate - is mapped to 9600 baud. - - If the baud rate is not within min..max, then if old is non-NULL, - the original baud rate will be tried. If that exceeds the - min..max constraint, 9600 baud will be returned. termios will - be updated to the baud rate in use. - - Note: min..max must always allow 9600 baud to be selected. - - Locking: caller dependent. - - Interrupts: n/a - -uart_get_divisor(port,baud) - Return the divisor (baud_base / baud) for the specified baud - rate, appropriately rounded. - - If 38400 baud and custom divisor is selected, return the - custom divisor instead. - - Locking: caller dependent. - - Interrupts: n/a - -uart_match_port(port1,port2) - This utility function can be used to determine whether two - uart_port structures describe the same port. - - Locking: n/a - - Interrupts: n/a - -uart_write_wakeup(port) - A driver is expected to call this function when the number of - characters in the transmit buffer have dropped below a threshold. - - Locking: port->lock should be held. - - Interrupts: n/a - -uart_register_driver(drv) - Register a uart driver with the core driver. We in turn register - with the tty layer, and initialise the core driver per-port state. - - drv->port should be NULL, and the per-port structures should be - registered using uart_add_one_port after this call has succeeded. - - Locking: none - - Interrupts: enabled - -uart_unregister_driver() - Remove all references to a driver from the core driver. The low - level driver must have removed all its ports via the - uart_remove_one_port() if it registered them with uart_add_one_port(). - - Locking: none - - Interrupts: enabled - -**uart_suspend_port()** - -**uart_resume_port()** - -**uart_add_one_port()** - -**uart_remove_one_port()** - -Other notes ------------ - -It is intended some day to drop the 'unused' entries from uart_port, and -allow low level drivers to register their own individual uart_port's with -the core. This will allow drivers to use uart_port as a pointer to a -structure containing both the uart_port entry with their own extensions, -thus:: - - struct my_port { - struct uart_port port; - int my_stuff; - }; - -Modem control lines via GPIO ----------------------------- - -Some helpers are provided in order to set/get modem control lines via GPIO. - -mctrl_gpio_init(port, idx): - This will get the {cts,rts,...}-gpios from device tree if they are - present and request them, set direction etc, and return an - allocated structure. `devm_*` functions are used, so there's no need - to call mctrl_gpio_free(). - As this sets up the irq handling make sure to not handle changes to the - gpio input lines in your driver, too. - -mctrl_gpio_free(dev, gpios): - This will free the requested gpios in mctrl_gpio_init(). - As `devm_*` functions are used, there's generally no need to call - this function. - -mctrl_gpio_to_gpiod(gpios, gidx) - This returns the gpio_desc structure associated to the modem line - index. - -mctrl_gpio_set(gpios, mctrl): - This will sets the gpios according to the mctrl state. - -mctrl_gpio_get(gpios, mctrl): - This will update mctrl with the gpios values. - -mctrl_gpio_enable_ms(gpios): - Enables irqs and handling of changes to the ms lines. - -mctrl_gpio_disable_ms(gpios): - Disables irqs and handling of changes to the ms lines. diff --git a/Documentation/serial/index.rst b/Documentation/serial/index.rst deleted file mode 100644 index d0ba22ea23bf..000000000000 --- a/Documentation/serial/index.rst +++ /dev/null @@ -1,32 +0,0 @@ -:orphan: - -========================== -Support for Serial devices -========================== - -.. toctree:: - :maxdepth: 1 - - - driver - tty - -Serial drivers -============== - -.. toctree:: - :maxdepth: 1 - - cyclades_z - moxa-smartio - n_gsm - rocket - serial-iso7816 - serial-rs485 - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/serial/moxa-smartio.rst b/Documentation/serial/moxa-smartio.rst deleted file mode 100644 index 156100f17c3f..000000000000 --- a/Documentation/serial/moxa-smartio.rst +++ /dev/null @@ -1,615 +0,0 @@ -============================================================= -MOXA Smartio/Industio Family Device Driver Installation Guide -============================================================= - -.. note:: - - This file is outdated. It needs some care in order to make it - updated to Kernel 5.0 and upper - -Copyright (C) 2008, Moxa Inc. - -Date: 01/21/2008 - -.. Content - - 1. Introduction - 2. System Requirement - 3. Installation - 3.1 Hardware installation - 3.2 Driver files - 3.3 Device naming convention - 3.4 Module driver configuration - 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x. - 3.6 Custom configuration - 3.7 Verify driver installation - 4. Utilities - 5. Setserial - 6. Troubleshooting - -1. Introduction -^^^^^^^^^^^^^^^ - - The Smartio/Industio/UPCI family Linux driver supports following multiport - boards. - - - 2 ports multiport board - CP-102U, CP-102UL, CP-102UF - CP-132U-I, CP-132UL, - CP-132, CP-132I, CP132S, CP-132IS, - CI-132, CI-132I, CI-132IS, - (C102H, C102HI, C102HIS, C102P, CP-102, CP-102S) - - - 4 ports multiport board - CP-104EL, - CP-104UL, CP-104JU, - CP-134U, CP-134U-I, - C104H/PCI, C104HS/PCI, - CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL, - C104H, C104HS, - CI-104J, CI-104JS, - CI-134, CI-134I, CI-134IS, - (C114HI, CT-114I, C104P), - POS-104UL, - CB-114, - CB-134I - - - 8 ports multiport board - CP-118EL, CP-168EL, - CP-118U, CP-168U, - C168H/PCI, - C168H, C168HS, - (C168P), - CB-108 - - This driver and installation procedure have been developed upon Linux Kernel - 2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order - to maintain compatibility, this version has also been properly tested with - RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem - occurs, please contact Moxa at support@moxa.com.tw. - - In addition to device driver, useful utilities are also provided in this - version. They are: - - - msdiag - Diagnostic program for displaying installed Moxa - Smartio/Industio boards. - - msmon - Monitor program to observe data count and line status signals. - - msterm A simple terminal program which is useful in testing serial - ports. - - io-irq.exe - Configuration program to setup ISA boards. Please note that - this program can only be executed under DOS. - - All the drivers and utilities are published in form of source code under - GNU General Public License in this version. Please refer to GNU General - Public License announcement in each source code file for more detail. - - In Moxa's Web sites, you may always find latest driver at http://www.moxa.com/. - - This version of driver can be installed as Loadable Module (Module driver) - or built-in into kernel (Static driver). You may refer to following - installation procedure for suitable one. Before you install the driver, - please refer to hardware installation procedure in the User's Manual. - - We assume the user should be familiar with following documents. - - - Serial-HOWTO - - Kernel-HOWTO - -2. System Requirement -^^^^^^^^^^^^^^^^^^^^^ - - - Hardware platform: Intel x86 machine - - Kernel version: 2.4.x or 2.6.x - - gcc version 2.72 or later - - Maximum 4 boards can be installed in combination - -3. Installation -^^^^^^^^^^^^^^^ - -3.1 Hardware installation -========================= - - There are two types of buses, ISA and PCI, for Smartio/Industio - family multiport board. - -ISA board ---------- - - You'll have to configure CAP address, I/O address, Interrupt Vector - as well as IRQ before installing this driver. Please refer to hardware - installation procedure in User's Manual before proceed any further. - Please make sure the JP1 is open after the ISA board is set properly. - -PCI/UPCI board --------------- - - You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict - with other ISA devices. Please refer to hardware installation - procedure in User's Manual in advance. - -PCI IRQ Sharing ---------------- - - Each port within the same multiport board shares the same IRQ. Up to - 4 Moxa Smartio/Industio PCI Family multiport boards can be installed - together on one system and they can share the same IRQ. - - -3.2 Driver files -================ - - The driver file may be obtained from ftp, CD-ROM or floppy disk. The - first step, anyway, is to copy driver file "mxser.tgz" into specified - directory. e.g. /moxa. The execute commands as below:: - - # cd / - # mkdir moxa - # cd /moxa - # tar xvf /dev/fd0 - -or:: - - # cd / - # mkdir moxa - # cd /moxa - # cp /mnt/cdrom//mxser.tgz . - # tar xvfz mxser.tgz - - -3.3 Device naming convention -============================ - - You may find all the driver and utilities files in /moxa/mxser. - Following installation procedure depends on the model you'd like to - run the driver. If you prefer module driver, please refer to 3.4. - If static driver is required, please refer to 3.5. - -Dialin and callout port ------------------------ - - This driver remains traditional serial device properties. There are - two special file name for each serial port. One is dial-in port - which is named "ttyMxx". For callout port, the naming convention - is "cumxx". - -Device naming when more than 2 boards installed ------------------------------------------------ - - Naming convention for each Smartio/Industio multiport board is - pre-defined as below. - - ============ =============== ============== - Board Num. Dial-in Port Callout port - 1st board ttyM0 - ttyM7 cum0 - cum7 - 2nd board ttyM8 - ttyM15 cum8 - cum15 - 3rd board ttyM16 - ttyM23 cum16 - cum23 - 4th board ttyM24 - ttym31 cum24 - cum31 - ============ =============== ============== - -.. note:: - - Under Kernel 2.6 and upper, the cum Device is Obsolete. So use ttyM* - device instead. - -Board sequence --------------- - - This driver will activate ISA boards according to the parameter set - in the driver. After all specified ISA board activated, PCI board - will be installed in the system automatically driven. - Therefore the board number is sorted by the CAP address of ISA boards. - For PCI boards, their sequence will be after ISA boards and C168H/PCI - has higher priority than C104H/PCI boards. - -3.4 Module driver configuration -=============================== - - Module driver is easiest way to install. If you prefer static driver - installation, please skip this paragraph. - - - ------------- Prepare to use the MOXA driver -------------------- - -3.4.1 Create tty device with correct major number -------------------------------------------------- - - Before using MOXA driver, your system must have the tty devices - which are created with driver's major number. We offer one shell - script "msmknod" to simplify the procedure. - This step is only needed to be executed once. But you still - need to do this procedure when: - - a. You change the driver's major number. Please refer the "3.7" - section. - b. Your total installed MOXA boards number is changed. Maybe you - add/delete one MOXA board. - c. You want to change the tty name. This needs to modify the - shell script "msmknod" - - The procedure is:: - - # cd /moxa/mxser/driver - # ./msmknod - - This shell script will require the major number for dial-in - device and callout device to create tty device. You also need - to specify the total installed MOXA board number. Default major - numbers for dial-in device and callout device are 30, 35. If - you need to change to other number, please refer section "3.7" - for more detailed procedure. - Msmknod will delete any special files occupying the same device - naming. - -3.4.2 Build the MOXA driver and utilities ------------------------------------------ - - Before using the MOXA driver and utilities, you need compile the - all the source code. This step is only need to be executed once. - But you still re-compile the source code if you modify the source - code. For example, if you change the driver's major number (see - "3.7" section), then you need to do this step again. - - Find "Makefile" in /moxa/mxser, then run - - # make clean; make install - - ..note:: - - For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1: - # make clean; make installsp1 - - For Red Hat Enterprise Linux AS4/ES4/WS4: - # make clean; make installsp2 - - The driver files "mxser.o" and utilities will be properly compiled - and copied to system directories respectively. - -------------- Load MOXA driver-------------------- - -3.4.3 Load the MOXA driver --------------------------- - - :: - - # modprobe mxser - - will activate the module driver. You may run "lsmod" to check - if "mxser" is activated. If the MOXA board is ISA board, the - is needed. Please refer to section "3.4.5" for more - information. - -------------- Load MOXA driver on boot -------------------- - -3.4.4 Load the mxser driver ---------------------------- - - - For the above description, you may manually execute - "modprobe mxser" to activate this driver and run - "rmmod mxser" to remove it. - - However, it's better to have a boot time configuration to - eliminate manual operation. Boot time configuration can be - achieved by rc file. We offer one "rc.mxser" file to simplify - the procedure under "moxa/mxser/driver". - - But if you use ISA board, please modify the "modprobe ..." command - to add the argument (see "3.4.5" section). After modifying the - rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser" - manually to make sure the modification is ok. If any error - encountered, please try to modify again. If the modification is - completed, follow the below step. - - Run following command for setting rc files:: - - # cd /moxa/mxser/driver - # cp ./rc.mxser /etc/rc.d - # cd /etc/rc.d - - Check "rc.serial" is existed or not. If "rc.serial" doesn't exist, - create it by vi, run "chmod 755 rc.serial" to change the permission. - - Add "/etc/rc.d/rc.mxser" in last line. - - Reboot and check if moxa.o activated by "lsmod" command. - -3.4.5. specify CAP address --------------------------- - - If you'd like to drive Smartio/Industio ISA boards in the system, - you'll have to add parameter to specify CAP address of given - board while activating "mxser.o". The format for parameters are - as follows.:: - - modprobe mxser ioaddr=0x???,0x???,0x???,0x??? - | | | | - | | | +- 4th ISA board - | | +------ 3rd ISA board - | +------------ 2nd ISA board - +-------------------1st ISA board - -3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x -================================================================ - - Note: - To use static driver, you must install the linux kernel - source package. - -3.5.1 Backup the built-in driver in the kernel ----------------------------------------------- - - :: - - # cd /usr/src/linux/drivers/char - # mv mxser.c mxser.c.old - - For Red Hat 7.x user, you need to create link: - # cd /usr/src - # ln -s linux-2.4 linux - -3.5.2 Create link ------------------ - :: - - # cd /usr/src/linux/drivers/char - # ln -s /moxa/mxser/driver/mxser.c mxser.c - -3.5.3 Add CAP address list for ISA boards. ------------------------------------------- - - For PCI boards user, please skip this step. - - In module mode, the CAP address for ISA board is given by - parameter. In static driver configuration, you'll have to - assign it within driver's source code. If you will not - install any ISA boards, you may skip to next portion. - The instructions to modify driver source code are as - below. - - a. run:: - - # cd /moxa/mxser/driver - # vi mxser.c - - b. Find the array mxserBoardCAP[] as below:: - - static int mxserBoardCAP[] = {0x00, 0x00, 0x00, 0x00}; - - c. Change the address within this array using vi. For - example, to driver 2 ISA boards with CAP address - 0x280 and 0x180 as 1st and 2nd board. Just to change - the source code as follows:: - - static int mxserBoardCAP[] = {0x280, 0x180, 0x00, 0x00}; - -3.5.4 Setup kernel configuration --------------------------------- - - Configure the kernel:: - - # cd /usr/src/linux - # make menuconfig - - You will go into a menu-driven system. Please select [Character - devices][Non-standard serial port support], enable the [Moxa - SmartIO support] driver with "[*]" for built-in (not "[M]"), then - select [Exit] to exit this program. - -3.5.5 Rebuild kernel --------------------- - - The following are for Linux kernel rebuilding, for your - reference only. - - For appropriate details, please refer to the Linux document: - - a. Run the following commands:: - - cd /usr/src/linux - make clean # take a few minutes - make dep # take a few minutes - make bzImage # take probably 10-20 minutes - make install # copy boot image to correct position - - f. Please make sure the boot kernel (vmlinuz) is in the - correct position. - g. If you use 'lilo' utility, you should check /etc/lilo.conf - 'image' item specified the path which is the 'vmlinuz' path, - or you will load wrong (or old) boot kernel image (vmlinuz). - After checking /etc/lilo.conf, please run "lilo". - - Note that if the result of "make bzImage" is ERROR, then you have to - go back to Linux configuration Setup. Type "make menuconfig" in - directory /usr/src/linux. - - -3.5.6 Make tty device and special file --------------------------------------- - - :: - # cd /moxa/mxser/driver - # ./msmknod - -3.5.7 Make utility ------------------- - - :: - - # cd /moxa/mxser/utility - # make clean; make install - -3.5.8 Reboot ------------- - - - -3.6 Custom configuration -======================== - - Although this driver already provides you default configuration, you - still can change the device name and major number. The instruction to - change these parameters are shown as below. - -a. Change Device name - - If you'd like to use other device names instead of default naming - convention, all you have to do is to modify the internal code - within the shell script "msmknod". First, you have to open "msmknod" - by vi. Locate each line contains "ttyM" and "cum" and change them - to the device name you desired. "msmknod" creates the device names - you need next time executed. - -b. Change Major number - - If major number 30 and 35 had been occupied, you may have to select - 2 free major numbers for this driver. There are 3 steps to change - major numbers. - -3.6.1 Find free major numbers ------------------------------ - - In /proc/devices, you may find all the major numbers occupied - in the system. Please select 2 major numbers that are available. - e.g. 40, 45. - -3.6.2 Create special files --------------------------- - - Run /moxa/mxser/driver/msmknod to create special files with - specified major numbers. - -3.6.3 Modify driver with new major number ------------------------------------------ - - Run vi to open /moxa/mxser/driver/mxser.c. Locate the line - contains "MXSERMAJOR". Change the content as below:: - - #define MXSERMAJOR 40 - #define MXSERCUMAJOR 45 - - 3.6.4 Run "make clean; make install" in /moxa/mxser/driver. - -3.7 Verify driver installation -============================== - - You may refer to /var/log/messages to check the latest status - log reported by this driver whenever it's activated. - -4. Utilities -^^^^^^^^^^^^ - - There are 3 utilities contained in this driver. They are msdiag, msmon and - msterm. These 3 utilities are released in form of source code. They should - be compiled into executable file and copied into /usr/bin. - - Before using these utilities, please load driver (refer 3.4 & 3.5) and - make sure you had run the "msmknod" utility. - -msdiag - Diagnostic -=================== - - This utility provides the function to display what Moxa Smartio/Industio - board found by driver in the system. - -msmon - Port Monitoring -======================= - - This utility gives the user a quick view about all the MOXA ports' - activities. One can easily learn each port's total received/transmitted - (Rx/Tx) character count since the time when the monitoring is started. - - Rx/Tx throughputs per second are also reported in interval basis (e.g. - the last 5 seconds) and in average basis (since the time the monitoring - is started). You can reset all ports' count by key. <+> <-> - (plus/minus) keys to change the displaying time interval. Press - on the port, that cursor stay, to view the port's communication - parameters, signal status, and input/output queue. - -msterm - Terminal Emulation -=========================== - - This utility provides data sending and receiving ability of all tty ports, - especially for MOXA ports. It is quite useful for testing simple - application, for example, sending AT command to a modem connected to the - port or used as a terminal for login purpose. Note that this is only a - dumb terminal emulation without handling full screen operation. - -5. Setserial -^^^^^^^^^^^^ - - Supported Setserial parameters are listed as below. - - ============== ========================================================= - uart set UART type(16450-->disable FIFO, 16550A-->enable FIFO) - close_delay set the amount of time(in 1/100 of a second) that DTR - should be kept low while being closed. - closing_wait set the amount of time(in 1/100 of a second) that the - serial port should wait for data to be drained while - being closed, before the receiver is disable. - spd_hi Use 57.6kb when the application requests 38.4kb. - spd_vhi Use 115.2kb when the application requests 38.4kb. - spd_shi Use 230.4kb when the application requests 38.4kb. - spd_warp Use 460.8kb when the application requests 38.4kb. - spd_normal Use 38.4kb when the application requests 38.4kb. - spd_cust Use the custom divisor to set the speed when the - application requests 38.4kb. - divisor This option set the custom division. - baud_base This option set the base baud rate. - ============== ========================================================= - -6. Troubleshooting -^^^^^^^^^^^^^^^^^^ - - The boot time error messages and solutions are stated as clearly as - possible. If all the possible solutions fail, please contact our technical - support team to get more help. - - - Error msg: - More than 4 Moxa Smartio/Industio family boards found. Fifth board - and after are ignored. - - Solution: - To avoid this problem, please unplug fifth and after board, because Moxa - driver supports up to 4 boards. - - Error msg: - Request_irq fail, IRQ(?) may be conflict with another device. - - Solution: - Other PCI or ISA devices occupy the assigned IRQ. If you are not sure - which device causes the situation, please check /proc/interrupts to find - free IRQ and simply change another free IRQ for Moxa board. - - Error msg: - Board #: C1xx Series(CAP=xxx) interrupt number invalid. - - Solution: - Each port within the same multiport board shares the same IRQ. Please set - one IRQ (IRQ doesn't equal to zero) for one Moxa board. - - Error msg: - No interrupt vector be set for Moxa ISA board(CAP=xxx). - - Solution: - Moxa ISA board needs an interrupt vector.Please refer to user's manual - "Hardware Installation" chapter to set interrupt vector. - - Error msg: - Couldn't install MOXA Smartio/Industio family driver! - - Solution: - Load Moxa driver fail, the major number may conflict with other devices. - Please refer to previous section 3.7 to change a free major number for - Moxa driver. - - Error msg: - Couldn't install MOXA Smartio/Industio family callout driver! - - Solution: - Load Moxa callout driver fail, the callout device major number may - conflict with other devices. Please refer to previous section 3.7 to - change a free callout device major number for Moxa driver. diff --git a/Documentation/serial/n_gsm.rst b/Documentation/serial/n_gsm.rst deleted file mode 100644 index f3ad9fd26408..000000000000 --- a/Documentation/serial/n_gsm.rst +++ /dev/null @@ -1,103 +0,0 @@ -============================== -GSM 0710 tty multiplexor HOWTO -============================== - -This line discipline implements the GSM 07.10 multiplexing protocol -detailed in the following 3GPP document: - - http://www.3gpp.org/ftp/Specs/archive/07_series/07.10/0710-720.zip - -This document give some hints on how to use this driver with GPRS and 3G -modems connected to a physical serial port. - -How to use it -------------- -1. initialize the modem in 0710 mux mode (usually AT+CMUX= command) through - its serial port. Depending on the modem used, you can pass more or less - parameters to this command, -2. switch the serial line to using the n_gsm line discipline by using - TIOCSETD ioctl, -3. configure the mux using GSMIOC_GETCONF / GSMIOC_SETCONF ioctl, - -Major parts of the initialization program : -(a good starting point is util-linux-ng/sys-utils/ldattach.c):: - - #include - #define N_GSM0710 21 /* GSM 0710 Mux */ - #define DEFAULT_SPEED B115200 - #define SERIAL_PORT /dev/ttyS0 - - int ldisc = N_GSM0710; - struct gsm_config c; - struct termios configuration; - - /* open the serial port connected to the modem */ - fd = open(SERIAL_PORT, O_RDWR | O_NOCTTY | O_NDELAY); - - /* configure the serial port : speed, flow control ... */ - - /* send the AT commands to switch the modem to CMUX mode - and check that it's successful (should return OK) */ - write(fd, "AT+CMUX=0\r", 10); - - /* experience showed that some modems need some time before - being able to answer to the first MUX packet so a delay - may be needed here in some case */ - sleep(3); - - /* use n_gsm line discipline */ - ioctl(fd, TIOCSETD, &ldisc); - - /* get n_gsm configuration */ - ioctl(fd, GSMIOC_GETCONF, &c); - /* we are initiator and need encoding 0 (basic) */ - c.initiator = 1; - c.encapsulation = 0; - /* our modem defaults to a maximum size of 127 bytes */ - c.mru = 127; - c.mtu = 127; - /* set the new configuration */ - ioctl(fd, GSMIOC_SETCONF, &c); - - /* and wait for ever to keep the line discipline enabled */ - daemon(0,0); - pause(); - -4. create the devices corresponding to the "virtual" serial ports (take care, - each modem has its configuration and some DLC have dedicated functions, - for example GPS), starting with minor 1 (DLC0 is reserved for the management - of the mux):: - - MAJOR=`cat /proc/devices |grep gsmtty | awk '{print $1}` - for i in `seq 1 4`; do - mknod /dev/ttygsm$i c $MAJOR $i - done - -5. use these devices as plain serial ports. - - for example, it's possible: - - - and to use gnokii to send / receive SMS on ttygsm1 - - to use ppp to establish a datalink on ttygsm2 - -6. first close all virtual ports before closing the physical port. - - Note that after closing the physical port the modem is still in multiplexing - mode. This may prevent a successful re-opening of the port later. To avoid - this situation either reset the modem if your hardware allows that or send - a disconnect command frame manually before initializing the multiplexing mode - for the second time. The byte sequence for the disconnect command frame is:: - - 0xf9, 0x03, 0xef, 0x03, 0xc3, 0x16, 0xf9. - -Additional Documentation ------------------------- -More practical details on the protocol and how it's supported by industrial -modems can be found in the following documents : - -- http://www.telit.com/module/infopool/download.php?id=616 -- http://www.u-blox.com/images/downloads/Product_Docs/LEON-G100-G200-MuxImplementation_ApplicationNote_%28GSM%20G1-CS-10002%29.pdf -- http://www.sierrawireless.com/Support/Downloads/AirPrime/WMP_Series/~/media/Support_Downloads/AirPrime/Application_notes/CMUX_Feature_Application_Note-Rev004.ashx -- http://wm.sim.com/sim/News/photo/2010721161442.pdf - -11-03-08 - Eric Bénard - diff --git a/Documentation/serial/rocket.rst b/Documentation/serial/rocket.rst deleted file mode 100644 index 23761eae4282..000000000000 --- a/Documentation/serial/rocket.rst +++ /dev/null @@ -1,185 +0,0 @@ -================================================ -Comtrol(tm) RocketPort(R)/RocketModem(TM) Series -================================================ - -Device Driver for the Linux Operating System -============================================ - -Product overview ----------------- - -This driver provides a loadable kernel driver for the Comtrol RocketPort -and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32 -high-speed serial ports or modems. This driver supports up to a combination -of four RocketPort or RocketModems boards in one machine simultaneously. -This file assumes that you are using the RocketPort driver which is -integrated into the kernel sources. - -The driver can also be installed as an external module using the usual -"make;make install" routine. This external module driver, obtainable -from the Comtrol website listed below, is useful for updating the driver -or installing it into kernels which do not have the driver configured -into them. Installations instructions for the external module -are in the included README and HW_INSTALL files. - -RocketPort ISA and RocketModem II PCI boards currently are only supported by -this driver in module form. - -The RocketPort ISA board requires I/O ports to be configured by the DIP -switches on the board. See the section "ISA Rocketport Boards" below for -information on how to set the DIP switches. - -You pass the I/O port to the driver using the following module parameters: - -board1: - I/O port for the first ISA board -board2: - I/O port for the second ISA board -board3: - I/O port for the third ISA board -board4: - I/O port for the fourth ISA board - -There is a set of utilities and scripts provided with the external driver -(downloadable from http://www.comtrol.com) that ease the configuration and -setup of the ISA cards. - -The RocketModem II PCI boards require firmware to be loaded into the card -before it will function. The driver has only been tested as a module for this -board. - -Installation Procedures ------------------------ - -RocketPort/RocketModem PCI cards require no driver configuration, they are -automatically detected and configured. - -The RocketPort driver can be installed as a module (recommended) or built -into the kernel. This is selected, as for other drivers, through the `make config` -command from the root of the Linux source tree during the kernel build process. - -The RocketPort/RocketModem serial ports installed by this driver are assigned -device major number 46, and will be named /dev/ttyRx, where x is the port number -starting at zero (ex. /dev/ttyR0, /devttyR1, ...). If you have multiple cards -installed in the system, the mapping of port names to serial ports is displayed -in the system log at /var/log/messages. - -If installed as a module, the module must be loaded. This can be done -manually by entering "modprobe rocket". To have the module loaded automatically -upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line -"alias char-major-46 rocket". - -In order to use the ports, their device names (nodes) must be created with mknod. -This is only required once, the system will retain the names once created. To -create the RocketPort/RocketModem device names, use the command -"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero. - -For example:: - - > mknod /dev/ttyR0 c 46 0 - > mknod /dev/ttyR1 c 46 1 - > mknod /dev/ttyR2 c 46 2 - -The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes) -for you:: - - >/dev/MAKEDEV ttyR - -ISA Rocketport Boards ---------------------- - -You must assign and configure the I/O addresses used by the ISA Rocketport -card before installing and using it. This is done by setting a set of DIP -switches on the Rocketport board. - - -Setting the I/O address ------------------------ - -Before installing RocketPort(R) or RocketPort RA boards, you must find -a range of I/O addresses for it to use. The first RocketPort card -requires a 68-byte contiguous block of I/O addresses, starting at one -of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h, -0x300h, 0x340h, 0x380h. This I/O address must be reflected in the DIP -switches of *all* of the Rocketport cards. - -The second, third, and fourth RocketPort cards require a 64-byte -contiguous block of I/O addresses, starting at one of the following -I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h, -0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h. The I/O address used by the -second, third, and fourth Rocketport cards (if present) are set via -software control. The DIP switch settings for the I/O address must be -set to the value of the first Rocketport cards. - -In order to distinguish each of the card from the others, each card -must have a unique board ID set on the dip switches. The first -Rocketport board must be set with the DIP switches corresponding to -the first board, the second board must be set with the DIP switches -corresponding to the second board, etc. IMPORTANT: The board ID is -the only place where the DIP switch settings should differ between the -various Rocketport boards in a system. - -The I/O address range used by any of the RocketPort cards must not -conflict with any other cards in the system, including other -RocketPort cards. Below, you will find a list of commonly used I/O -address ranges which may be in use by other devices in your system. -On a Linux system, "cat /proc/ioports" will also be helpful in -identifying what I/O addresses are being used by devices on your -system. - -Remember, the FIRST RocketPort uses 68 I/O addresses. So, if you set it -for 0x100, it will occupy 0x100 to 0x143. This would mean that you -CAN NOT set the second, third or fourth board for address 0x140 since -the first 4 bytes of that range are used by the first board. You would -need to set the second, third, or fourth board to one of the next available -blocks such as 0x180. - -RocketPort and RocketPort RA SW1 Settings:: - - +-------------------------------+ - | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - +-------+-------+---------------+ - | Unused| Card | I/O Port Block| - +-------------------------------+ - - DIP Switches DIP Switches - 7 8 6 5 - =================== =================== - On On UNUSED, MUST BE ON. On On First Card <==== Default - On Off Second Card - Off On Third Card - Off Off Fourth Card - - DIP Switches I/O Address Range - 4 3 2 1 Used by the First Card - ===================================== - On Off On Off 100-143 - On Off Off On 140-183 - On Off Off Off 180-1C3 <==== Default - Off On On Off 200-243 - Off On Off On 240-283 - Off On Off Off 280-2C3 - Off Off On Off 300-343 - Off Off Off On 340-383 - Off Off Off Off 380-3C3 - -Reporting Bugs --------------- - -For technical support, please provide the following -information: Driver version, kernel release, distribution of -kernel, and type of board you are using. Error messages and log -printouts port configuration details are especially helpful. - -USA: - :Phone: (612) 494-4100 - :FAX: (612) 494-4199 - :email: support@comtrol.com - -Comtrol Europe: - :Phone: +44 (0) 1 869 323-220 - :FAX: +44 (0) 1 869 323-211 - :email: support@comtrol.co.uk - -Web: http://www.comtrol.com -FTP: ftp.comtrol.com diff --git a/Documentation/serial/serial-iso7816.rst b/Documentation/serial/serial-iso7816.rst deleted file mode 100644 index d990143de0c6..000000000000 --- a/Documentation/serial/serial-iso7816.rst +++ /dev/null @@ -1,90 +0,0 @@ -============================= -ISO7816 Serial Communications -============================= - -1. Introduction -=============== - - ISO/IEC7816 is a series of standards specifying integrated circuit cards (ICC) - also known as smart cards. - -2. Hardware-related considerations -================================== - - Some CPUs/UARTs (e.g., Microchip AT91) contain a built-in mode capable of - handling communication with a smart card. - - For these microcontrollers, the Linux driver should be made capable of - working in both modes, and proper ioctls (see later) should be made - available at user-level to allow switching from one mode to the other, and - vice versa. - -3. Data Structures Already Available in the Kernel -================================================== - - The Linux kernel provides the serial_iso7816 structure (see [1]) to handle - ISO7816 communications. This data structure is used to set and configure - ISO7816 parameters in ioctls. - - Any driver for devices capable of working both as RS232 and ISO7816 should - implement the iso7816_config callback in the uart_port structure. The - serial_core calls iso7816_config to do the device specific part in response - to TIOCGISO7816 and TIOCSISO7816 ioctls (see below). The iso7816_config - callback receives a pointer to struct serial_iso7816. - -4. Usage from user-level -======================== - - From user-level, ISO7816 configuration can be get/set using the previous - ioctls. For instance, to set ISO7816 you can use the following code:: - - #include - - /* Include definition for ISO7816 ioctls: TIOCSISO7816 and TIOCGISO7816 */ - #include - - /* Open your specific device (e.g., /dev/mydevice): */ - int fd = open ("/dev/mydevice", O_RDWR); - if (fd < 0) { - /* Error handling. See errno. */ - } - - struct serial_iso7816 iso7816conf; - - /* Reserved fields as to be zeroed */ - memset(&iso7816conf, 0, sizeof(iso7816conf)); - - /* Enable ISO7816 mode: */ - iso7816conf.flags |= SER_ISO7816_ENABLED; - - /* Select the protocol: */ - /* T=0 */ - iso7816conf.flags |= SER_ISO7816_T(0); - /* or T=1 */ - iso7816conf.flags |= SER_ISO7816_T(1); - - /* Set the guard time: */ - iso7816conf.tg = 2; - - /* Set the clock frequency*/ - iso7816conf.clk = 3571200; - - /* Set transmission factors: */ - iso7816conf.sc_fi = 372; - iso7816conf.sc_di = 1; - - if (ioctl(fd_usart, TIOCSISO7816, &iso7816conf) < 0) { - /* Error handling. See errno. */ - } - - /* Use read() and write() syscalls here... */ - - /* Close the device when finished: */ - if (close (fd) < 0) { - /* Error handling. See errno. */ - } - -5. References -============= - - [1] include/uapi/linux/serial.h diff --git a/Documentation/serial/serial-rs485.rst b/Documentation/serial/serial-rs485.rst deleted file mode 100644 index 6bc824f948f9..000000000000 --- a/Documentation/serial/serial-rs485.rst +++ /dev/null @@ -1,103 +0,0 @@ -=========================== -RS485 Serial Communications -=========================== - -1. Introduction -=============== - - EIA-485, also known as TIA/EIA-485 or RS-485, is a standard defining the - electrical characteristics of drivers and receivers for use in balanced - digital multipoint systems. - This standard is widely used for communications in industrial automation - because it can be used effectively over long distances and in electrically - noisy environments. - -2. Hardware-related Considerations -================================== - - Some CPUs/UARTs (e.g., Atmel AT91 or 16C950 UART) contain a built-in - half-duplex mode capable of automatically controlling line direction by - toggling RTS or DTR signals. That can be used to control external - half-duplex hardware like an RS485 transceiver or any RS232-connected - half-duplex devices like some modems. - - For these microcontrollers, the Linux driver should be made capable of - working in both modes, and proper ioctls (see later) should be made - available at user-level to allow switching from one mode to the other, and - vice versa. - -3. Data Structures Already Available in the Kernel -================================================== - - The Linux kernel provides the serial_rs485 structure (see [1]) to handle - RS485 communications. This data structure is used to set and configure RS485 - parameters in the platform data and in ioctls. - - The device tree can also provide RS485 boot time parameters (see [2] - for bindings). The driver is in charge of filling this data structure from - the values given by the device tree. - - Any driver for devices capable of working both as RS232 and RS485 should - implement the rs485_config callback in the uart_port structure. The - serial_core calls rs485_config to do the device specific part in response - to TIOCSRS485 and TIOCGRS485 ioctls (see below). The rs485_config callback - receives a pointer to struct serial_rs485. - -4. Usage from user-level -======================== - - From user-level, RS485 configuration can be get/set using the previous - ioctls. For instance, to set RS485 you can use the following code:: - - #include - - /* Include definition for RS485 ioctls: TIOCGRS485 and TIOCSRS485 */ - #include - - /* Open your specific device (e.g., /dev/mydevice): */ - int fd = open ("/dev/mydevice", O_RDWR); - if (fd < 0) { - /* Error handling. See errno. */ - } - - struct serial_rs485 rs485conf; - - /* Enable RS485 mode: */ - rs485conf.flags |= SER_RS485_ENABLED; - - /* Set logical level for RTS pin equal to 1 when sending: */ - rs485conf.flags |= SER_RS485_RTS_ON_SEND; - /* or, set logical level for RTS pin equal to 0 when sending: */ - rs485conf.flags &= ~(SER_RS485_RTS_ON_SEND); - - /* Set logical level for RTS pin equal to 1 after sending: */ - rs485conf.flags |= SER_RS485_RTS_AFTER_SEND; - /* or, set logical level for RTS pin equal to 0 after sending: */ - rs485conf.flags &= ~(SER_RS485_RTS_AFTER_SEND); - - /* Set rts delay before send, if needed: */ - rs485conf.delay_rts_before_send = ...; - - /* Set rts delay after send, if needed: */ - rs485conf.delay_rts_after_send = ...; - - /* Set this flag if you want to receive data even while sending data */ - rs485conf.flags |= SER_RS485_RX_DURING_TX; - - if (ioctl (fd, TIOCSRS485, &rs485conf) < 0) { - /* Error handling. See errno. */ - } - - /* Use read() and write() syscalls here... */ - - /* Close the device when finished: */ - if (close (fd) < 0) { - /* Error handling. See errno. */ - } - -5. References -============= - - [1] include/uapi/linux/serial.h - - [2] Documentation/devicetree/bindings/serial/rs485.txt diff --git a/Documentation/serial/tty.rst b/Documentation/serial/tty.rst deleted file mode 100644 index dd972caacf3e..000000000000 --- a/Documentation/serial/tty.rst +++ /dev/null @@ -1,328 +0,0 @@ -================= -The Lockronomicon -================= - -Your guide to the ancient and twisted locking policies of the tty layer and -the warped logic behind them. Beware all ye who read on. - - -Line Discipline ---------------- - -Line disciplines are registered with tty_register_ldisc() passing the -discipline number and the ldisc structure. At the point of registration the -discipline must be ready to use and it is possible it will get used before -the call returns success. If the call returns an error then it won't get -called. Do not re-use ldisc numbers as they are part of the userspace ABI -and writing over an existing ldisc will cause demons to eat your computer. -After the return the ldisc data has been copied so you may free your own -copy of the structure. You must not re-register over the top of the line -discipline even with the same data or your computer again will be eaten by -demons. - -In order to remove a line discipline call tty_unregister_ldisc(). -In ancient times this always worked. In modern times the function will -return -EBUSY if the ldisc is currently in use. Since the ldisc referencing -code manages the module counts this should not usually be a concern. - -Heed this warning: the reference count field of the registered copies of the -tty_ldisc structure in the ldisc table counts the number of lines using this -discipline. The reference count of the tty_ldisc structure within a tty -counts the number of active users of the ldisc at this instant. In effect it -counts the number of threads of execution within an ldisc method (plus those -about to enter and exit although this detail matters not). - -Line Discipline Methods ------------------------ - -TTY side interfaces -^^^^^^^^^^^^^^^^^^^ - -======================= ======================================================= -open() Called when the line discipline is attached to - the terminal. No other call into the line - discipline for this tty will occur until it - completes successfully. Should initialize any - state needed by the ldisc, and set receive_room - in the tty_struct to the maximum amount of data - the line discipline is willing to accept from the - driver with a single call to receive_buf(). - Returning an error will prevent the ldisc from - being attached. Can sleep. - -close() This is called on a terminal when the line - discipline is being unplugged. At the point of - execution no further users will enter the - ldisc code for this tty. Can sleep. - -hangup() Called when the tty line is hung up. - The line discipline should cease I/O to the tty. - No further calls into the ldisc code will occur. - The return value is ignored. Can sleep. - -read() (optional) A process requests reading data from - the line. Multiple read calls may occur in parallel - and the ldisc must deal with serialization issues. - If not defined, the process will receive an EIO - error. May sleep. - -write() (optional) A process requests writing data to the - line. Multiple write calls are serialized by the - tty layer for the ldisc. If not defined, the - process will receive an EIO error. May sleep. - -flush_buffer() (optional) May be called at any point between - open and close, and instructs the line discipline - to empty its input buffer. - -set_termios() (optional) Called on termios structure changes. - The caller passes the old termios data and the - current data is in the tty. Called under the - termios semaphore so allowed to sleep. Serialized - against itself only. - -poll() (optional) Check the status for the poll/select - calls. Multiple poll calls may occur in parallel. - May sleep. - -ioctl() (optional) Called when an ioctl is handed to the - tty layer that might be for the ldisc. Multiple - ioctl calls may occur in parallel. May sleep. - -compat_ioctl() (optional) Called when a 32 bit ioctl is handed - to the tty layer that might be for the ldisc. - Multiple ioctl calls may occur in parallel. - May sleep. -======================= ======================================================= - -Driver Side Interfaces -^^^^^^^^^^^^^^^^^^^^^^ - -======================= ======================================================= -receive_buf() (optional) Called by the low-level driver to hand - a buffer of received bytes to the ldisc for - processing. The number of bytes is guaranteed not - to exceed the current value of tty->receive_room. - All bytes must be processed. - -receive_buf2() (optional) Called by the low-level driver to hand - a buffer of received bytes to the ldisc for - processing. Returns the number of bytes processed. - - If both receive_buf() and receive_buf2() are - defined, receive_buf2() should be preferred. - -write_wakeup() May be called at any point between open and close. - The TTY_DO_WRITE_WAKEUP flag indicates if a call - is needed but always races versus calls. Thus the - ldisc must be careful about setting order and to - handle unexpected calls. Must not sleep. - - The driver is forbidden from calling this directly - from the ->write call from the ldisc as the ldisc - is permitted to call the driver write method from - this function. In such a situation defer it. - -dcd_change() Report to the tty line the current DCD pin status - changes and the relative timestamp. The timestamp - cannot be NULL. -======================= ======================================================= - - -Driver Access -^^^^^^^^^^^^^ - -Line discipline methods can call the following methods of the underlying -hardware driver through the function pointers within the tty->driver -structure: - -======================= ======================================================= -write() Write a block of characters to the tty device. - Returns the number of characters accepted. The - character buffer passed to this method is already - in kernel space. - -put_char() Queues a character for writing to the tty device. - If there is no room in the queue, the character is - ignored. - -flush_chars() (Optional) If defined, must be called after - queueing characters with put_char() in order to - start transmission. - -write_room() Returns the numbers of characters the tty driver - will accept for queueing to be written. - -ioctl() Invoke device specific ioctl. - Expects data pointers to refer to userspace. - Returns ENOIOCTLCMD for unrecognized ioctl numbers. - -set_termios() Notify the tty driver that the device's termios - settings have changed. New settings are in - tty->termios. Previous settings should be passed in - the "old" argument. - - The API is defined such that the driver should return - the actual modes selected. This means that the - driver function is responsible for modifying any - bits in the request it cannot fulfill to indicate - the actual modes being used. A device with no - hardware capability for change (e.g. a USB dongle or - virtual port) can provide NULL for this method. - -throttle() Notify the tty driver that input buffers for the - line discipline are close to full, and it should - somehow signal that no more characters should be - sent to the tty. - -unthrottle() Notify the tty driver that characters can now be - sent to the tty without fear of overrunning the - input buffers of the line disciplines. - -stop() Ask the tty driver to stop outputting characters - to the tty device. - -start() Ask the tty driver to resume sending characters - to the tty device. - -hangup() Ask the tty driver to hang up the tty device. - -break_ctl() (Optional) Ask the tty driver to turn on or off - BREAK status on the RS-232 port. If state is -1, - then the BREAK status should be turned on; if - state is 0, then BREAK should be turned off. - If this routine is not implemented, use ioctls - TIOCSBRK / TIOCCBRK instead. - -wait_until_sent() Waits until the device has written out all of the - characters in its transmitter FIFO. - -send_xchar() Send a high-priority XON/XOFF character to the device. -======================= ======================================================= - - -Flags -^^^^^ - -Line discipline methods have access to tty->flags field containing the -following interesting flags: - -======================= ======================================================= -TTY_THROTTLED Driver input is throttled. The ldisc should call - tty->driver->unthrottle() in order to resume - reception when it is ready to process more data. - -TTY_DO_WRITE_WAKEUP If set, causes the driver to call the ldisc's - write_wakeup() method in order to resume - transmission when it can accept more data - to transmit. - -TTY_IO_ERROR If set, causes all subsequent userspace read/write - calls on the tty to fail, returning -EIO. - -TTY_OTHER_CLOSED Device is a pty and the other side has closed. - -TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into - smaller chunks. -======================= ======================================================= - - -Locking -^^^^^^^ - -Callers to the line discipline functions from the tty layer are required to -take line discipline locks. The same is true of calls from the driver side -but not yet enforced. - -Three calls are now provided:: - - ldisc = tty_ldisc_ref(tty); - -takes a handle to the line discipline in the tty and returns it. If no ldisc -is currently attached or the ldisc is being closed and re-opened at this -point then NULL is returned. While this handle is held the ldisc will not -change or go away:: - - tty_ldisc_deref(ldisc) - -Returns the ldisc reference and allows the ldisc to be closed. Returning the -reference takes away your right to call the ldisc functions until you take -a new reference:: - - ldisc = tty_ldisc_ref_wait(tty); - -Performs the same function as tty_ldisc_ref except that it will wait for an -ldisc change to complete and then return a reference to the new ldisc. - -While these functions are slightly slower than the old code they should have -minimal impact as most receive logic uses the flip buffers and they only -need to take a reference when they push bits up through the driver. - -A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc -functions are called with the ldisc unavailable. Thus tty_ldisc_ref will -fail in this situation if used within these functions. Ldisc and driver -code calling its own functions must be careful in this case. - - -Driver Interface ----------------- - -======================= ======================================================= -open() Called when a device is opened. May sleep - -close() Called when a device is closed. At the point of - return from this call the driver must make no - further ldisc calls of any kind. May sleep - -write() Called to write bytes to the device. May not - sleep. May occur in parallel in special cases. - Because this includes panic paths drivers generally - shouldn't try and do clever locking here. - -put_char() Stuff a single character onto the queue. The - driver is guaranteed following up calls to - flush_chars. - -flush_chars() Ask the kernel to write put_char queue - -write_room() Return the number of characters that can be stuffed - into the port buffers without overflow (or less). - The ldisc is responsible for being intelligent - about multi-threading of write_room/write calls - -ioctl() Called when an ioctl may be for the driver - -set_termios() Called on termios change, serialized against - itself by a semaphore. May sleep. - -set_ldisc() Notifier for discipline change. At the point this - is done the discipline is not yet usable. Can now - sleep (I think) - -throttle() Called by the ldisc to ask the driver to do flow - control. Serialization including with unthrottle - is the job of the ldisc layer. - -unthrottle() Called by the ldisc to ask the driver to stop flow - control. - -stop() Ldisc notifier to the driver to stop output. As with - throttle the serializations with start() are down - to the ldisc layer. - -start() Ldisc notifier to the driver to start output. - -hangup() Ask the tty driver to cause a hangup initiated - from the host side. [Can sleep ??] - -break_ctl() Send RS232 break. Can sleep. Can get called in - parallel, driver must serialize (for now), and - with write calls. - -wait_until_sent() Wait for characters to exit the hardware queue - of the driver. Can sleep - -send_xchar() Send XON/XOFF and if possible jump the queue with - it in order to get fast flow control responses. - Cannot sleep ?? -======================= ======================================================= -- cgit From 4745dc8abb0a0a9851c07265eea01d844886d5c8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 16:36:04 -0300 Subject: docs: phy: place documentation under driver-api This subsystem-specific documentation belongs to the driver-api. Signed-off-by: Mauro Carvalho Chehab --- .../devicetree/bindings/phy/phy-bindings.txt | 2 +- .../devicetree/bindings/phy/phy-pxa-usb.txt | 2 +- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/phy/index.rst | 16 ++ Documentation/driver-api/phy/phy.rst | 197 +++++++++++++++++++++ Documentation/driver-api/phy/samsung-usb2.rst | 137 ++++++++++++++ Documentation/index.rst | 1 - Documentation/phy.txt | 197 --------------------- Documentation/phy/samsung-usb2.rst | 137 -------------- 9 files changed, 353 insertions(+), 337 deletions(-) create mode 100644 Documentation/driver-api/phy/index.rst create mode 100644 Documentation/driver-api/phy/phy.rst create mode 100644 Documentation/driver-api/phy/samsung-usb2.rst delete mode 100644 Documentation/phy.txt delete mode 100644 Documentation/phy/samsung-usb2.rst (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/phy-bindings.txt b/Documentation/devicetree/bindings/phy/phy-bindings.txt index a403b81d0679..c4eb38902533 100644 --- a/Documentation/devicetree/bindings/phy/phy-bindings.txt +++ b/Documentation/devicetree/bindings/phy/phy-bindings.txt @@ -1,5 +1,5 @@ This document explains only the device tree data binding. For general -information about PHY subsystem refer to Documentation/phy.txt +information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst PHY device node =============== diff --git a/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt index 93fc09c12954..d80e36a77ec5 100644 --- a/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt +++ b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt @@ -15,4 +15,4 @@ Example: }; This document explains the device tree binding. For general -information about PHY subsystem refer to Documentation/phy.txt +information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index cf39b8f9d0f9..eff22db0ed14 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -85,6 +85,7 @@ available subsections can be seen below. parport-lowlevel pps ptp + phy/index pti_intel_mid pwm rfkill diff --git a/Documentation/driver-api/phy/index.rst b/Documentation/driver-api/phy/index.rst new file mode 100644 index 000000000000..fce9ffae2812 --- /dev/null +++ b/Documentation/driver-api/phy/index.rst @@ -0,0 +1,16 @@ +===================== +Generic PHY Framework +===================== + +.. toctree:: + + phy + samsung-usb2 + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` + diff --git a/Documentation/driver-api/phy/phy.rst b/Documentation/driver-api/phy/phy.rst new file mode 100644 index 000000000000..457c3e0f86d6 --- /dev/null +++ b/Documentation/driver-api/phy/phy.rst @@ -0,0 +1,197 @@ +============= +PHY subsystem +============= + +:Author: Kishon Vijay Abraham I + +This document explains the Generic PHY Framework along with the APIs provided, +and how-to-use. + +Introduction +============ + +*PHY* is the abbreviation for physical layer. It is used to connect a device +to the physical medium e.g., the USB controller has a PHY to provide functions +such as serialization, de-serialization, encoding, decoding and is responsible +for obtaining the required data transmission rate. Note that some USB +controllers have PHY functionality embedded into it and others use an external +PHY. Other peripherals that use PHY include Wireless LAN, Ethernet, +SATA etc. + +The intention of creating this framework is to bring the PHY drivers spread +all over the Linux kernel to drivers/phy to increase code re-use and for +better code maintainability. + +This framework will be of use only to devices that use external PHY (PHY +functionality is not embedded within the controller). + +Registering/Unregistering the PHY provider +========================================== + +PHY provider refers to an entity that implements one or more PHY instances. +For the simple case where the PHY provider implements only a single instance of +the PHY, the framework provides its own implementation of of_xlate in +of_phy_simple_xlate. If the PHY provider implements multiple instances, it +should provide its own implementation of of_xlate. of_xlate is used only for +dt boot case. + +:: + + #define of_phy_provider_register(dev, xlate) \ + __of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate)) + + #define devm_of_phy_provider_register(dev, xlate) \ + __devm_of_phy_provider_register((dev), NULL, THIS_MODULE, + (xlate)) + +of_phy_provider_register and devm_of_phy_provider_register macros can be used to +register the phy_provider and it takes device and of_xlate as +arguments. For the dt boot case, all PHY providers should use one of the above +2 macros to register the PHY provider. + +Often the device tree nodes associated with a PHY provider will contain a set +of children that each represent a single PHY. Some bindings may nest the child +nodes within extra levels for context and extensibility, in which case the low +level of_phy_provider_register_full() and devm_of_phy_provider_register_full() +macros can be used to override the node containing the children. + +:: + + #define of_phy_provider_register_full(dev, children, xlate) \ + __of_phy_provider_register(dev, children, THIS_MODULE, xlate) + + #define devm_of_phy_provider_register_full(dev, children, xlate) \ + __devm_of_phy_provider_register_full(dev, children, + THIS_MODULE, xlate) + + void devm_of_phy_provider_unregister(struct device *dev, + struct phy_provider *phy_provider); + void of_phy_provider_unregister(struct phy_provider *phy_provider); + +devm_of_phy_provider_unregister and of_phy_provider_unregister can be used to +unregister the PHY. + +Creating the PHY +================ + +The PHY driver should create the PHY in order for other peripheral controllers +to make use of it. The PHY framework provides 2 APIs to create the PHY. + +:: + + struct phy *phy_create(struct device *dev, struct device_node *node, + const struct phy_ops *ops); + struct phy *devm_phy_create(struct device *dev, + struct device_node *node, + const struct phy_ops *ops); + +The PHY drivers can use one of the above 2 APIs to create the PHY by passing +the device pointer and phy ops. +phy_ops is a set of function pointers for performing PHY operations such as +init, exit, power_on and power_off. + +Inorder to dereference the private data (in phy_ops), the phy provider driver +can use phy_set_drvdata() after creating the PHY and use phy_get_drvdata() in +phy_ops to get back the private data. + +4. Getting a reference to the PHY + +Before the controller can make use of the PHY, it has to get a reference to +it. This framework provides the following APIs to get a reference to the PHY. + +:: + + struct phy *phy_get(struct device *dev, const char *string); + struct phy *phy_optional_get(struct device *dev, const char *string); + struct phy *devm_phy_get(struct device *dev, const char *string); + struct phy *devm_phy_optional_get(struct device *dev, + const char *string); + struct phy *devm_of_phy_get_by_index(struct device *dev, + struct device_node *np, + int index); + +phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can +be used to get the PHY. In the case of dt boot, the string arguments +should contain the phy name as given in the dt data and in the case of +non-dt boot, it should contain the label of the PHY. The two +devm_phy_get associates the device with the PHY using devres on +successful PHY get. On driver detach, release function is invoked on +the devres data and devres data is freed. phy_optional_get and +devm_phy_optional_get should be used when the phy is optional. These +two functions will never return -ENODEV, but instead returns NULL when +the phy cannot be found.Some generic drivers, such as ehci, may use multiple +phys and for such drivers referencing phy(s) by name(s) does not make sense. In +this case, devm_of_phy_get_by_index can be used to get a phy reference based on +the index. + +It should be noted that NULL is a valid phy reference. All phy +consumer calls on the NULL phy become NOPs. That is the release calls, +the phy_init() and phy_exit() calls, and phy_power_on() and +phy_power_off() calls are all NOP when applied to a NULL phy. The NULL +phy is useful in devices for handling optional phy devices. + +Releasing a reference to the PHY +================================ + +When the controller no longer needs the PHY, it has to release the reference +to the PHY it has obtained using the APIs mentioned in the above section. The +PHY framework provides 2 APIs to release a reference to the PHY. + +:: + + void phy_put(struct phy *phy); + void devm_phy_put(struct device *dev, struct phy *phy); + +Both these APIs are used to release a reference to the PHY and devm_phy_put +destroys the devres associated with this PHY. + +Destroying the PHY +================== + +When the driver that created the PHY is unloaded, it should destroy the PHY it +created using one of the following 2 APIs:: + + void phy_destroy(struct phy *phy); + void devm_phy_destroy(struct device *dev, struct phy *phy); + +Both these APIs destroy the PHY and devm_phy_destroy destroys the devres +associated with this PHY. + +PM Runtime +========== + +This subsystem is pm runtime enabled. So while creating the PHY, +pm_runtime_enable of the phy device created by this subsystem is called and +while destroying the PHY, pm_runtime_disable is called. Note that the phy +device created by this subsystem will be a child of the device that calls +phy_create (PHY provider device). + +So pm_runtime_get_sync of the phy_device created by this subsystem will invoke +pm_runtime_get_sync of PHY provider device because of parent-child relationship. +It should also be noted that phy_power_on and phy_power_off performs +phy_pm_runtime_get_sync and phy_pm_runtime_put respectively. +There are exported APIs like phy_pm_runtime_get, phy_pm_runtime_get_sync, +phy_pm_runtime_put, phy_pm_runtime_put_sync, phy_pm_runtime_allow and +phy_pm_runtime_forbid for performing PM operations. + +PHY Mappings +============ + +In order to get reference to a PHY without help from DeviceTree, the framework +offers lookups which can be compared to clkdev that allow clk structures to be +bound to devices. A lookup can be made be made during runtime when a handle to +the struct phy already exists. + +The framework offers the following API for registering and unregistering the +lookups:: + + int phy_create_lookup(struct phy *phy, const char *con_id, + const char *dev_id); + void phy_remove_lookup(struct phy *phy, const char *con_id, + const char *dev_id); + +DeviceTree Binding +================== + +The documentation for PHY dt binding can be found @ +Documentation/devicetree/bindings/phy/phy-bindings.txt diff --git a/Documentation/driver-api/phy/samsung-usb2.rst b/Documentation/driver-api/phy/samsung-usb2.rst new file mode 100644 index 000000000000..c48c8b9797b9 --- /dev/null +++ b/Documentation/driver-api/phy/samsung-usb2.rst @@ -0,0 +1,137 @@ +==================================== +Samsung USB 2.0 PHY adaptation layer +==================================== + +1. Description +-------------- + +The architecture of the USB 2.0 PHY module in Samsung SoCs is similar +among many SoCs. In spite of the similarities it proved difficult to +create a one driver that would fit all these PHY controllers. Often +the differences were minor and were found in particular bits of the +registers of the PHY. In some rare cases the order of register writes or +the PHY powering up process had to be altered. This adaptation layer is +a compromise between having separate drivers and having a single driver +with added support for many special cases. + +2. Files description +-------------------- + +- phy-samsung-usb2.c + This is the main file of the adaptation layer. This file contains + the probe function and provides two callbacks to the Generic PHY + Framework. This two callbacks are used to power on and power off the + phy. They carry out the common work that has to be done on all version + of the PHY module. Depending on which SoC was chosen they execute SoC + specific callbacks. The specific SoC version is selected by choosing + the appropriate compatible string. In addition, this file contains + struct of_device_id definitions for particular SoCs. + +- phy-samsung-usb2.h + This is the include file. It declares the structures used by this + driver. In addition it should contain extern declarations for + structures that describe particular SoCs. + +3. Supporting SoCs +------------------ + +To support a new SoC a new file should be added to the drivers/phy +directory. Each SoC's configuration is stored in an instance of the +struct samsung_usb2_phy_config:: + + struct samsung_usb2_phy_config { + const struct samsung_usb2_common_phy *phys; + int (*rate_to_clk)(unsigned long, u32 *); + unsigned int num_phys; + bool has_mode_switch; + }; + +The num_phys is the number of phys handled by the driver. `*phys` is an +array that contains the configuration for each phy. The has_mode_switch +property is a boolean flag that determines whether the SoC has USB host +and device on a single pair of pins. If so, a special register has to +be modified to change the internal routing of these pins between a USB +device or host module. + +For example the configuration for Exynos 4210 is following:: + + const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = { + .has_mode_switch = 0, + .num_phys = EXYNOS4210_NUM_PHYS, + .phys = exynos4210_phys, + .rate_to_clk = exynos4210_rate_to_clk, + } + +- `int (*rate_to_clk)(unsigned long, u32 *)` + + The rate_to_clk callback is to convert the rate of the clock + used as the reference clock for the PHY module to the value + that should be written in the hardware register. + +The exynos4210_phys configuration array is as follows:: + + static const struct samsung_usb2_common_phy exynos4210_phys[] = { + { + .label = "device", + .id = EXYNOS4210_DEVICE, + .power_on = exynos4210_power_on, + .power_off = exynos4210_power_off, + }, + { + .label = "host", + .id = EXYNOS4210_HOST, + .power_on = exynos4210_power_on, + .power_off = exynos4210_power_off, + }, + { + .label = "hsic0", + .id = EXYNOS4210_HSIC0, + .power_on = exynos4210_power_on, + .power_off = exynos4210_power_off, + }, + { + .label = "hsic1", + .id = EXYNOS4210_HSIC1, + .power_on = exynos4210_power_on, + .power_off = exynos4210_power_off, + }, + {}, + }; + +- `int (*power_on)(struct samsung_usb2_phy_instance *);` + `int (*power_off)(struct samsung_usb2_phy_instance *);` + + These two callbacks are used to power on and power off the phy + by modifying appropriate registers. + +Final change to the driver is adding appropriate compatible value to the +phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were +added to the struct of_device_id samsung_usb2_phy_of_match[] array:: + + #ifdef CONFIG_PHY_EXYNOS4210_USB2 + { + .compatible = "samsung,exynos4210-usb2-phy", + .data = &exynos4210_usb2_phy_config, + }, + #endif + +To add further flexibility to the driver the Kconfig file enables to +include support for selected SoCs in the compiled driver. The Kconfig +entry for Exynos 4210 is following:: + + config PHY_EXYNOS4210_USB2 + bool "Support for Exynos 4210" + depends on PHY_SAMSUNG_USB2 + depends on CPU_EXYNOS4210 + help + Enable USB PHY support for Exynos 4210. This option requires that + Samsung USB 2.0 PHY driver is enabled and means that support for this + particular SoC is compiled in the driver. In case of Exynos 4210 four + phys are available - device, host, HSCI0 and HSCI1. + +The newly created file that supports the new SoC has to be also added to the +Makefile. In case of Exynos 4210 the added line is following:: + + obj-$(CONFIG_PHY_EXYNOS4210_USB2) += phy-exynos4210-usb2.o + +After completing these steps the support for the new SoC should be ready. diff --git a/Documentation/index.rst b/Documentation/index.rst index 041ffe442960..dbfec00ba535 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -111,7 +111,6 @@ needed). usb/index misc-devices/index mic/index - phy/samsung-usb2 scheduler/index Architecture-specific documentation diff --git a/Documentation/phy.txt b/Documentation/phy.txt deleted file mode 100644 index 457c3e0f86d6..000000000000 --- a/Documentation/phy.txt +++ /dev/null @@ -1,197 +0,0 @@ -============= -PHY subsystem -============= - -:Author: Kishon Vijay Abraham I - -This document explains the Generic PHY Framework along with the APIs provided, -and how-to-use. - -Introduction -============ - -*PHY* is the abbreviation for physical layer. It is used to connect a device -to the physical medium e.g., the USB controller has a PHY to provide functions -such as serialization, de-serialization, encoding, decoding and is responsible -for obtaining the required data transmission rate. Note that some USB -controllers have PHY functionality embedded into it and others use an external -PHY. Other peripherals that use PHY include Wireless LAN, Ethernet, -SATA etc. - -The intention of creating this framework is to bring the PHY drivers spread -all over the Linux kernel to drivers/phy to increase code re-use and for -better code maintainability. - -This framework will be of use only to devices that use external PHY (PHY -functionality is not embedded within the controller). - -Registering/Unregistering the PHY provider -========================================== - -PHY provider refers to an entity that implements one or more PHY instances. -For the simple case where the PHY provider implements only a single instance of -the PHY, the framework provides its own implementation of of_xlate in -of_phy_simple_xlate. If the PHY provider implements multiple instances, it -should provide its own implementation of of_xlate. of_xlate is used only for -dt boot case. - -:: - - #define of_phy_provider_register(dev, xlate) \ - __of_phy_provider_register((dev), NULL, THIS_MODULE, (xlate)) - - #define devm_of_phy_provider_register(dev, xlate) \ - __devm_of_phy_provider_register((dev), NULL, THIS_MODULE, - (xlate)) - -of_phy_provider_register and devm_of_phy_provider_register macros can be used to -register the phy_provider and it takes device and of_xlate as -arguments. For the dt boot case, all PHY providers should use one of the above -2 macros to register the PHY provider. - -Often the device tree nodes associated with a PHY provider will contain a set -of children that each represent a single PHY. Some bindings may nest the child -nodes within extra levels for context and extensibility, in which case the low -level of_phy_provider_register_full() and devm_of_phy_provider_register_full() -macros can be used to override the node containing the children. - -:: - - #define of_phy_provider_register_full(dev, children, xlate) \ - __of_phy_provider_register(dev, children, THIS_MODULE, xlate) - - #define devm_of_phy_provider_register_full(dev, children, xlate) \ - __devm_of_phy_provider_register_full(dev, children, - THIS_MODULE, xlate) - - void devm_of_phy_provider_unregister(struct device *dev, - struct phy_provider *phy_provider); - void of_phy_provider_unregister(struct phy_provider *phy_provider); - -devm_of_phy_provider_unregister and of_phy_provider_unregister can be used to -unregister the PHY. - -Creating the PHY -================ - -The PHY driver should create the PHY in order for other peripheral controllers -to make use of it. The PHY framework provides 2 APIs to create the PHY. - -:: - - struct phy *phy_create(struct device *dev, struct device_node *node, - const struct phy_ops *ops); - struct phy *devm_phy_create(struct device *dev, - struct device_node *node, - const struct phy_ops *ops); - -The PHY drivers can use one of the above 2 APIs to create the PHY by passing -the device pointer and phy ops. -phy_ops is a set of function pointers for performing PHY operations such as -init, exit, power_on and power_off. - -Inorder to dereference the private data (in phy_ops), the phy provider driver -can use phy_set_drvdata() after creating the PHY and use phy_get_drvdata() in -phy_ops to get back the private data. - -4. Getting a reference to the PHY - -Before the controller can make use of the PHY, it has to get a reference to -it. This framework provides the following APIs to get a reference to the PHY. - -:: - - struct phy *phy_get(struct device *dev, const char *string); - struct phy *phy_optional_get(struct device *dev, const char *string); - struct phy *devm_phy_get(struct device *dev, const char *string); - struct phy *devm_phy_optional_get(struct device *dev, - const char *string); - struct phy *devm_of_phy_get_by_index(struct device *dev, - struct device_node *np, - int index); - -phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can -be used to get the PHY. In the case of dt boot, the string arguments -should contain the phy name as given in the dt data and in the case of -non-dt boot, it should contain the label of the PHY. The two -devm_phy_get associates the device with the PHY using devres on -successful PHY get. On driver detach, release function is invoked on -the devres data and devres data is freed. phy_optional_get and -devm_phy_optional_get should be used when the phy is optional. These -two functions will never return -ENODEV, but instead returns NULL when -the phy cannot be found.Some generic drivers, such as ehci, may use multiple -phys and for such drivers referencing phy(s) by name(s) does not make sense. In -this case, devm_of_phy_get_by_index can be used to get a phy reference based on -the index. - -It should be noted that NULL is a valid phy reference. All phy -consumer calls on the NULL phy become NOPs. That is the release calls, -the phy_init() and phy_exit() calls, and phy_power_on() and -phy_power_off() calls are all NOP when applied to a NULL phy. The NULL -phy is useful in devices for handling optional phy devices. - -Releasing a reference to the PHY -================================ - -When the controller no longer needs the PHY, it has to release the reference -to the PHY it has obtained using the APIs mentioned in the above section. The -PHY framework provides 2 APIs to release a reference to the PHY. - -:: - - void phy_put(struct phy *phy); - void devm_phy_put(struct device *dev, struct phy *phy); - -Both these APIs are used to release a reference to the PHY and devm_phy_put -destroys the devres associated with this PHY. - -Destroying the PHY -================== - -When the driver that created the PHY is unloaded, it should destroy the PHY it -created using one of the following 2 APIs:: - - void phy_destroy(struct phy *phy); - void devm_phy_destroy(struct device *dev, struct phy *phy); - -Both these APIs destroy the PHY and devm_phy_destroy destroys the devres -associated with this PHY. - -PM Runtime -========== - -This subsystem is pm runtime enabled. So while creating the PHY, -pm_runtime_enable of the phy device created by this subsystem is called and -while destroying the PHY, pm_runtime_disable is called. Note that the phy -device created by this subsystem will be a child of the device that calls -phy_create (PHY provider device). - -So pm_runtime_get_sync of the phy_device created by this subsystem will invoke -pm_runtime_get_sync of PHY provider device because of parent-child relationship. -It should also be noted that phy_power_on and phy_power_off performs -phy_pm_runtime_get_sync and phy_pm_runtime_put respectively. -There are exported APIs like phy_pm_runtime_get, phy_pm_runtime_get_sync, -phy_pm_runtime_put, phy_pm_runtime_put_sync, phy_pm_runtime_allow and -phy_pm_runtime_forbid for performing PM operations. - -PHY Mappings -============ - -In order to get reference to a PHY without help from DeviceTree, the framework -offers lookups which can be compared to clkdev that allow clk structures to be -bound to devices. A lookup can be made be made during runtime when a handle to -the struct phy already exists. - -The framework offers the following API for registering and unregistering the -lookups:: - - int phy_create_lookup(struct phy *phy, const char *con_id, - const char *dev_id); - void phy_remove_lookup(struct phy *phy, const char *con_id, - const char *dev_id); - -DeviceTree Binding -================== - -The documentation for PHY dt binding can be found @ -Documentation/devicetree/bindings/phy/phy-bindings.txt diff --git a/Documentation/phy/samsung-usb2.rst b/Documentation/phy/samsung-usb2.rst deleted file mode 100644 index c48c8b9797b9..000000000000 --- a/Documentation/phy/samsung-usb2.rst +++ /dev/null @@ -1,137 +0,0 @@ -==================================== -Samsung USB 2.0 PHY adaptation layer -==================================== - -1. Description --------------- - -The architecture of the USB 2.0 PHY module in Samsung SoCs is similar -among many SoCs. In spite of the similarities it proved difficult to -create a one driver that would fit all these PHY controllers. Often -the differences were minor and were found in particular bits of the -registers of the PHY. In some rare cases the order of register writes or -the PHY powering up process had to be altered. This adaptation layer is -a compromise between having separate drivers and having a single driver -with added support for many special cases. - -2. Files description --------------------- - -- phy-samsung-usb2.c - This is the main file of the adaptation layer. This file contains - the probe function and provides two callbacks to the Generic PHY - Framework. This two callbacks are used to power on and power off the - phy. They carry out the common work that has to be done on all version - of the PHY module. Depending on which SoC was chosen they execute SoC - specific callbacks. The specific SoC version is selected by choosing - the appropriate compatible string. In addition, this file contains - struct of_device_id definitions for particular SoCs. - -- phy-samsung-usb2.h - This is the include file. It declares the structures used by this - driver. In addition it should contain extern declarations for - structures that describe particular SoCs. - -3. Supporting SoCs ------------------- - -To support a new SoC a new file should be added to the drivers/phy -directory. Each SoC's configuration is stored in an instance of the -struct samsung_usb2_phy_config:: - - struct samsung_usb2_phy_config { - const struct samsung_usb2_common_phy *phys; - int (*rate_to_clk)(unsigned long, u32 *); - unsigned int num_phys; - bool has_mode_switch; - }; - -The num_phys is the number of phys handled by the driver. `*phys` is an -array that contains the configuration for each phy. The has_mode_switch -property is a boolean flag that determines whether the SoC has USB host -and device on a single pair of pins. If so, a special register has to -be modified to change the internal routing of these pins between a USB -device or host module. - -For example the configuration for Exynos 4210 is following:: - - const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = { - .has_mode_switch = 0, - .num_phys = EXYNOS4210_NUM_PHYS, - .phys = exynos4210_phys, - .rate_to_clk = exynos4210_rate_to_clk, - } - -- `int (*rate_to_clk)(unsigned long, u32 *)` - - The rate_to_clk callback is to convert the rate of the clock - used as the reference clock for the PHY module to the value - that should be written in the hardware register. - -The exynos4210_phys configuration array is as follows:: - - static const struct samsung_usb2_common_phy exynos4210_phys[] = { - { - .label = "device", - .id = EXYNOS4210_DEVICE, - .power_on = exynos4210_power_on, - .power_off = exynos4210_power_off, - }, - { - .label = "host", - .id = EXYNOS4210_HOST, - .power_on = exynos4210_power_on, - .power_off = exynos4210_power_off, - }, - { - .label = "hsic0", - .id = EXYNOS4210_HSIC0, - .power_on = exynos4210_power_on, - .power_off = exynos4210_power_off, - }, - { - .label = "hsic1", - .id = EXYNOS4210_HSIC1, - .power_on = exynos4210_power_on, - .power_off = exynos4210_power_off, - }, - {}, - }; - -- `int (*power_on)(struct samsung_usb2_phy_instance *);` - `int (*power_off)(struct samsung_usb2_phy_instance *);` - - These two callbacks are used to power on and power off the phy - by modifying appropriate registers. - -Final change to the driver is adding appropriate compatible value to the -phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were -added to the struct of_device_id samsung_usb2_phy_of_match[] array:: - - #ifdef CONFIG_PHY_EXYNOS4210_USB2 - { - .compatible = "samsung,exynos4210-usb2-phy", - .data = &exynos4210_usb2_phy_config, - }, - #endif - -To add further flexibility to the driver the Kconfig file enables to -include support for selected SoCs in the compiled driver. The Kconfig -entry for Exynos 4210 is following:: - - config PHY_EXYNOS4210_USB2 - bool "Support for Exynos 4210" - depends on PHY_SAMSUNG_USB2 - depends on CPU_EXYNOS4210 - help - Enable USB PHY support for Exynos 4210. This option requires that - Samsung USB 2.0 PHY driver is enabled and means that support for this - particular SoC is compiled in the driver. In case of Exynos 4210 four - phys are available - device, host, HSCI0 and HSCI1. - -The newly created file that supports the new SoC has to be also added to the -Makefile. In case of Exynos 4210 the added line is following:: - - obj-$(CONFIG_PHY_EXYNOS4210_USB2) += phy-exynos4210-usb2.o - -After completing these steps the support for the new SoC should be ready. -- cgit From 652a49bc68ce3cf0355bde357b3998bd63e73915 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 15:03:13 -0300 Subject: docs: add a memory-devices subdir to driver-api There are two docs describing memory device drivers. Add both to this new chapter of the driver-api. Signed-off-by: Mauro Carvalho Chehab --- Documentation/bus-devices/ti-gpmc.rst | 179 --------------------- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/memory-devices/index.rst | 16 ++ .../driver-api/memory-devices/ti-emif.rst | 64 ++++++++ .../driver-api/memory-devices/ti-gpmc.rst | 179 +++++++++++++++++++++ Documentation/memory-devices/ti-emif.rst | 64 -------- 6 files changed, 260 insertions(+), 243 deletions(-) delete mode 100644 Documentation/bus-devices/ti-gpmc.rst create mode 100644 Documentation/driver-api/memory-devices/index.rst create mode 100644 Documentation/driver-api/memory-devices/ti-emif.rst create mode 100644 Documentation/driver-api/memory-devices/ti-gpmc.rst delete mode 100644 Documentation/memory-devices/ti-emif.rst (limited to 'Documentation') diff --git a/Documentation/bus-devices/ti-gpmc.rst b/Documentation/bus-devices/ti-gpmc.rst deleted file mode 100644 index 87c366e418be..000000000000 --- a/Documentation/bus-devices/ti-gpmc.rst +++ /dev/null @@ -1,179 +0,0 @@ -:orphan: - -======================================== -GPMC (General Purpose Memory Controller) -======================================== - -GPMC is an unified memory controller dedicated to interfacing external -memory devices like - - * Asynchronous SRAM like memories and application specific integrated - circuit devices. - * Asynchronous, synchronous, and page mode burst NOR flash devices - NAND flash - * Pseudo-SRAM devices - -GPMC is found on Texas Instruments SoC's (OMAP based) -IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1 - - -GPMC generic timing calculation: -================================ - -GPMC has certain timings that has to be programmed for proper -functioning of the peripheral, while peripheral has another set of -timings. To have peripheral work with gpmc, peripheral timings has to -be translated to the form gpmc can understand. The way it has to be -translated depends on the connected peripheral. Also there is a -dependency for certain gpmc timings on gpmc clock frequency. Hence a -generic timing routine was developed to achieve above requirements. - -Generic routine provides a generic method to calculate gpmc timings -from gpmc peripheral timings. struct gpmc_device_timings fields has to -be updated with timings from the datasheet of the peripheral that is -connected to gpmc. A few of the peripheral timings can be fed either -in time or in cycles, provision to handle this scenario has been -provided (refer struct gpmc_device_timings definition). It may so -happen that timing as specified by peripheral datasheet is not present -in timing structure, in this scenario, try to correlate peripheral -timing to the one available. If that doesn't work, try to add a new -field as required by peripheral, educate generic timing routine to -handle it, make sure that it does not break any of the existing. -Then there may be cases where peripheral datasheet doesn't mention -certain fields of struct gpmc_device_timings, zero those entries. - -Generic timing routine has been verified to work properly on -multiple onenand's and tusb6010 peripherals. - -A word of caution: generic timing routine has been developed based -on understanding of gpmc timings, peripheral timings, available -custom timing routines, a kind of reverse engineering without -most of the datasheets & hardware (to be exact none of those supported -in mainline having custom timing routine) and by simulation. - -gpmc timing dependency on peripheral timings: - -[: , ...] - -1. common - -cs_on: - t_ceasu -adv_on: - t_avdasu, t_ceavd - -2. sync common - -sync_clk: - clk -page_burst_access: - t_bacc -clk_activation: - t_ces, t_avds - -3. read async muxed - -adv_rd_off: - t_avdp_r -oe_on: - t_oeasu, t_aavdh -access: - t_iaa, t_oe, t_ce, t_aa -rd_cycle: - t_rd_cycle, t_cez_r, t_oez - -4. read async non-muxed - -adv_rd_off: - t_avdp_r -oe_on: - t_oeasu -access: - t_iaa, t_oe, t_ce, t_aa -rd_cycle: - t_rd_cycle, t_cez_r, t_oez - -5. read sync muxed - -adv_rd_off: - t_avdp_r, t_avdh -oe_on: - t_oeasu, t_ach, cyc_aavdh_oe -access: - t_iaa, cyc_iaa, cyc_oe -rd_cycle: - t_cez_r, t_oez, t_ce_rdyz - -6. read sync non-muxed - -adv_rd_off: - t_avdp_r -oe_on: - t_oeasu -access: - t_iaa, cyc_iaa, cyc_oe -rd_cycle: - t_cez_r, t_oez, t_ce_rdyz - -7. write async muxed - -adv_wr_off: - t_avdp_w -we_on, wr_data_mux_bus: - t_weasu, t_aavdh, cyc_aavhd_we -we_off: - t_wpl -cs_wr_off: - t_wph -wr_cycle: - t_cez_w, t_wr_cycle - -8. write async non-muxed - -adv_wr_off: - t_avdp_w -we_on, wr_data_mux_bus: - t_weasu -we_off: - t_wpl -cs_wr_off: - t_wph -wr_cycle: - t_cez_w, t_wr_cycle - -9. write sync muxed - -adv_wr_off: - t_avdp_w, t_avdh -we_on, wr_data_mux_bus: - t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we -we_off: - t_wpl, cyc_wpl -cs_wr_off: - t_wph -wr_cycle: - t_cez_w, t_ce_rdyz - -10. write sync non-muxed - -adv_wr_off: - t_avdp_w -we_on, wr_data_mux_bus: - t_weasu, t_rdyo -we_off: - t_wpl, cyc_wpl -cs_wr_off: - t_wph -wr_cycle: - t_cez_w, t_ce_rdyz - - -Note: - Many of gpmc timings are dependent on other gpmc timings (a few - gpmc timings purely dependent on other gpmc timings, a reason that - some of the gpmc timings are missing above), and it will result in - indirect dependency of peripheral timings to gpmc timings other than - mentioned above, refer timing routine for more details. To know what - these peripheral timings correspond to, please see explanations in - struct gpmc_device_timings definition. And for gpmc timings refer - IP details (link above). diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index eff22db0ed14..d12a80f386a6 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -79,6 +79,7 @@ available subsections can be seen below. isapnp generic-counter lightnvm-pblk + memory-devices/index men-chameleon-bus ntb nvmem diff --git a/Documentation/driver-api/memory-devices/index.rst b/Documentation/driver-api/memory-devices/index.rst new file mode 100644 index 000000000000..87549828f6ab --- /dev/null +++ b/Documentation/driver-api/memory-devices/index.rst @@ -0,0 +1,16 @@ +========================= +Memory Controller drivers +========================= + +.. toctree:: + :maxdepth: 1 + + ti-emif + ti-gpmc + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/memory-devices/ti-emif.rst b/Documentation/driver-api/memory-devices/ti-emif.rst new file mode 100644 index 000000000000..dea2ad9bcd7e --- /dev/null +++ b/Documentation/driver-api/memory-devices/ti-emif.rst @@ -0,0 +1,64 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +TI EMIF SDRAM Controller Driver +=============================== + +Author +====== +Aneesh V + +Location +======== +driver/memory/emif.c + +Supported SoCs: +=============== +TI OMAP44xx +TI OMAP54xx + +Menuconfig option: +================== +Device Drivers + Memory devices + Texas Instruments EMIF driver + +Description +=========== +This driver is for the EMIF module available in Texas Instruments +SoCs. EMIF is an SDRAM controller that, based on its revision, +supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols. +This driver takes care of only LPDDR2 memories presently. The +functions of the driver includes re-configuring AC timing +parameters and other settings during frequency, voltage and +temperature changes + +Platform Data (see include/linux/platform_data/emif_plat.h) +=========================================================== +DDR device details and other board dependent and SoC dependent +information can be passed through platform data (struct emif_platform_data) + +- DDR device details: 'struct ddr_device_info' +- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck' +- Custom configurations: customizable policy options through + 'struct emif_custom_configs' +- IP revision +- PHY type + +Interface to the external world +=============================== +EMIF driver registers notifiers for voltage and frequency changes +affecting EMIF and takes appropriate actions when these are invoked. + +- freq_pre_notify_handling() +- freq_post_notify_handling() +- volt_notify_handling() + +Debugfs +======= +The driver creates two debugfs entries per device. + +- regcache_dump : dump of register values calculated and saved for all + frequencies used so far. +- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4 + indicates the current temperature level of the device. diff --git a/Documentation/driver-api/memory-devices/ti-gpmc.rst b/Documentation/driver-api/memory-devices/ti-gpmc.rst new file mode 100644 index 000000000000..33efcb81f080 --- /dev/null +++ b/Documentation/driver-api/memory-devices/ti-gpmc.rst @@ -0,0 +1,179 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +GPMC (General Purpose Memory Controller) +======================================== + +GPMC is an unified memory controller dedicated to interfacing external +memory devices like + + * Asynchronous SRAM like memories and application specific integrated + circuit devices. + * Asynchronous, synchronous, and page mode burst NOR flash devices + NAND flash + * Pseudo-SRAM devices + +GPMC is found on Texas Instruments SoC's (OMAP based) +IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1 + + +GPMC generic timing calculation: +================================ + +GPMC has certain timings that has to be programmed for proper +functioning of the peripheral, while peripheral has another set of +timings. To have peripheral work with gpmc, peripheral timings has to +be translated to the form gpmc can understand. The way it has to be +translated depends on the connected peripheral. Also there is a +dependency for certain gpmc timings on gpmc clock frequency. Hence a +generic timing routine was developed to achieve above requirements. + +Generic routine provides a generic method to calculate gpmc timings +from gpmc peripheral timings. struct gpmc_device_timings fields has to +be updated with timings from the datasheet of the peripheral that is +connected to gpmc. A few of the peripheral timings can be fed either +in time or in cycles, provision to handle this scenario has been +provided (refer struct gpmc_device_timings definition). It may so +happen that timing as specified by peripheral datasheet is not present +in timing structure, in this scenario, try to correlate peripheral +timing to the one available. If that doesn't work, try to add a new +field as required by peripheral, educate generic timing routine to +handle it, make sure that it does not break any of the existing. +Then there may be cases where peripheral datasheet doesn't mention +certain fields of struct gpmc_device_timings, zero those entries. + +Generic timing routine has been verified to work properly on +multiple onenand's and tusb6010 peripherals. + +A word of caution: generic timing routine has been developed based +on understanding of gpmc timings, peripheral timings, available +custom timing routines, a kind of reverse engineering without +most of the datasheets & hardware (to be exact none of those supported +in mainline having custom timing routine) and by simulation. + +gpmc timing dependency on peripheral timings: + +[: , ...] + +1. common + +cs_on: + t_ceasu +adv_on: + t_avdasu, t_ceavd + +2. sync common + +sync_clk: + clk +page_burst_access: + t_bacc +clk_activation: + t_ces, t_avds + +3. read async muxed + +adv_rd_off: + t_avdp_r +oe_on: + t_oeasu, t_aavdh +access: + t_iaa, t_oe, t_ce, t_aa +rd_cycle: + t_rd_cycle, t_cez_r, t_oez + +4. read async non-muxed + +adv_rd_off: + t_avdp_r +oe_on: + t_oeasu +access: + t_iaa, t_oe, t_ce, t_aa +rd_cycle: + t_rd_cycle, t_cez_r, t_oez + +5. read sync muxed + +adv_rd_off: + t_avdp_r, t_avdh +oe_on: + t_oeasu, t_ach, cyc_aavdh_oe +access: + t_iaa, cyc_iaa, cyc_oe +rd_cycle: + t_cez_r, t_oez, t_ce_rdyz + +6. read sync non-muxed + +adv_rd_off: + t_avdp_r +oe_on: + t_oeasu +access: + t_iaa, cyc_iaa, cyc_oe +rd_cycle: + t_cez_r, t_oez, t_ce_rdyz + +7. write async muxed + +adv_wr_off: + t_avdp_w +we_on, wr_data_mux_bus: + t_weasu, t_aavdh, cyc_aavhd_we +we_off: + t_wpl +cs_wr_off: + t_wph +wr_cycle: + t_cez_w, t_wr_cycle + +8. write async non-muxed + +adv_wr_off: + t_avdp_w +we_on, wr_data_mux_bus: + t_weasu +we_off: + t_wpl +cs_wr_off: + t_wph +wr_cycle: + t_cez_w, t_wr_cycle + +9. write sync muxed + +adv_wr_off: + t_avdp_w, t_avdh +we_on, wr_data_mux_bus: + t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we +we_off: + t_wpl, cyc_wpl +cs_wr_off: + t_wph +wr_cycle: + t_cez_w, t_ce_rdyz + +10. write sync non-muxed + +adv_wr_off: + t_avdp_w +we_on, wr_data_mux_bus: + t_weasu, t_rdyo +we_off: + t_wpl, cyc_wpl +cs_wr_off: + t_wph +wr_cycle: + t_cez_w, t_ce_rdyz + + +Note: + Many of gpmc timings are dependent on other gpmc timings (a few + gpmc timings purely dependent on other gpmc timings, a reason that + some of the gpmc timings are missing above), and it will result in + indirect dependency of peripheral timings to gpmc timings other than + mentioned above, refer timing routine for more details. To know what + these peripheral timings correspond to, please see explanations in + struct gpmc_device_timings definition. And for gpmc timings refer + IP details (link above). diff --git a/Documentation/memory-devices/ti-emif.rst b/Documentation/memory-devices/ti-emif.rst deleted file mode 100644 index c9242294e63c..000000000000 --- a/Documentation/memory-devices/ti-emif.rst +++ /dev/null @@ -1,64 +0,0 @@ -:orphan: - -=============================== -TI EMIF SDRAM Controller Driver -=============================== - -Author -====== -Aneesh V - -Location -======== -driver/memory/emif.c - -Supported SoCs: -=============== -TI OMAP44xx -TI OMAP54xx - -Menuconfig option: -================== -Device Drivers - Memory devices - Texas Instruments EMIF driver - -Description -=========== -This driver is for the EMIF module available in Texas Instruments -SoCs. EMIF is an SDRAM controller that, based on its revision, -supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols. -This driver takes care of only LPDDR2 memories presently. The -functions of the driver includes re-configuring AC timing -parameters and other settings during frequency, voltage and -temperature changes - -Platform Data (see include/linux/platform_data/emif_plat.h) -=========================================================== -DDR device details and other board dependent and SoC dependent -information can be passed through platform data (struct emif_platform_data) - -- DDR device details: 'struct ddr_device_info' -- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck' -- Custom configurations: customizable policy options through - 'struct emif_custom_configs' -- IP revision -- PHY type - -Interface to the external world -=============================== -EMIF driver registers notifiers for voltage and frequency changes -affecting EMIF and takes appropriate actions when these are invoked. - -- freq_pre_notify_handling() -- freq_post_notify_handling() -- volt_notify_handling() - -Debugfs -======= -The driver creates two debugfs entries per device. - -- regcache_dump : dump of register values calculated and saved for all - frequencies used so far. -- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4 - indicates the current temperature level of the device. -- cgit From 7e042736faab9457dd754668b9db2a1113cd322b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 07:13:34 -0300 Subject: docs: add SPDX tags to new index files All those new files I added are under GPL v2.0 license. Add the corresponding SPDX headers to them. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/blockdev/drbd/figures.rst | 2 ++ Documentation/admin-guide/blockdev/index.rst | 2 ++ Documentation/admin-guide/laptops/index.rst | 1 + Documentation/admin-guide/namespaces/index.rst | 2 ++ Documentation/admin-guide/perf/index.rst | 2 ++ Documentation/arm/index.rst | 2 ++ Documentation/arm/nwfpe/index.rst | 2 ++ Documentation/arm/omap/index.rst | 2 ++ Documentation/arm/sa1100/index.rst | 2 ++ Documentation/arm/samsung-s3c24xx/index.rst | 2 ++ Documentation/arm/samsung/index.rst | 2 ++ Documentation/driver-api/early-userspace/index.rst | 2 ++ Documentation/driver-api/md/index.rst | 2 ++ Documentation/driver-api/memory-devices/index.rst | 2 ++ Documentation/driver-api/mmc/index.rst | 2 ++ Documentation/driver-api/mtd/index.rst | 2 ++ Documentation/driver-api/nfc/index.rst | 2 ++ Documentation/driver-api/nvdimm/index.rst | 2 ++ Documentation/driver-api/phy/index.rst | 2 ++ Documentation/driver-api/rapidio/index.rst | 2 ++ Documentation/ia64/index.rst | 2 ++ 21 files changed, 41 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst index 3e3fd4b8a478..bd9a4901fe46 100644 --- a/Documentation/admin-guide/blockdev/drbd/figures.rst +++ b/Documentation/admin-guide/blockdev/drbd/figures.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + .. The here included files are intended to help understand the implementation Data flows that Relate some functions, and write packets diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst index 20a738d9d047..b903cf152091 100644 --- a/Documentation/admin-guide/blockdev/index.rst +++ b/Documentation/admin-guide/blockdev/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =========================== The Linux RapidIO Subsystem =========================== diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst index 6b554e39863b..cd9a1c2695fd 100644 --- a/Documentation/admin-guide/laptops/index.rst +++ b/Documentation/admin-guide/laptops/index.rst @@ -1,3 +1,4 @@ +.. SPDX-License-Identifier: GPL-2.0 ============== Laptop Drivers diff --git a/Documentation/admin-guide/namespaces/index.rst b/Documentation/admin-guide/namespaces/index.rst index 713ec4949fa7..384f2e0f33d2 100644 --- a/Documentation/admin-guide/namespaces/index.rst +++ b/Documentation/admin-guide/namespaces/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ========== Namespaces ========== diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 9d445451ea18..ee4bfd2a740f 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =========================== Performance monitor support =========================== diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst index 9c2f781f4685..5fc072dd0c5e 100644 --- a/Documentation/arm/index.rst +++ b/Documentation/arm/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================ ARM Architecture ================ diff --git a/Documentation/arm/nwfpe/index.rst b/Documentation/arm/nwfpe/index.rst index 21fa8ce192ae..3c4d2f9aa10e 100644 --- a/Documentation/arm/nwfpe/index.rst +++ b/Documentation/arm/nwfpe/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =================================== NetWinder's floating point emulator =================================== diff --git a/Documentation/arm/omap/index.rst b/Documentation/arm/omap/index.rst index f1e9c11d9f9b..8b365b212e49 100644 --- a/Documentation/arm/omap/index.rst +++ b/Documentation/arm/omap/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ======= TI OMAP ======= diff --git a/Documentation/arm/sa1100/index.rst b/Documentation/arm/sa1100/index.rst index fb2385b3accf..68c2a280a745 100644 --- a/Documentation/arm/sa1100/index.rst +++ b/Documentation/arm/sa1100/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ==================== Intel StrongARM 1100 ==================== diff --git a/Documentation/arm/samsung-s3c24xx/index.rst b/Documentation/arm/samsung-s3c24xx/index.rst index 6c7b241cbf37..5b8a7f9398d8 100644 --- a/Documentation/arm/samsung-s3c24xx/index.rst +++ b/Documentation/arm/samsung-s3c24xx/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ========================== Samsung S3C24XX SoC Family ========================== diff --git a/Documentation/arm/samsung/index.rst b/Documentation/arm/samsung/index.rst index f54d95734362..8142cce3d23e 100644 --- a/Documentation/arm/samsung/index.rst +++ b/Documentation/arm/samsung/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =========== Samsung SoC =========== diff --git a/Documentation/driver-api/early-userspace/index.rst b/Documentation/driver-api/early-userspace/index.rst index 6f20c3c560d8..149c1822f06d 100644 --- a/Documentation/driver-api/early-userspace/index.rst +++ b/Documentation/driver-api/early-userspace/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =============== Early Userspace =============== diff --git a/Documentation/driver-api/md/index.rst b/Documentation/driver-api/md/index.rst index 205080891a1a..18f54a7d7d6e 100644 --- a/Documentation/driver-api/md/index.rst +++ b/Documentation/driver-api/md/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ==== RAID ==== diff --git a/Documentation/driver-api/memory-devices/index.rst b/Documentation/driver-api/memory-devices/index.rst index 87549828f6ab..28101458cda5 100644 --- a/Documentation/driver-api/memory-devices/index.rst +++ b/Documentation/driver-api/memory-devices/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ========================= Memory Controller drivers ========================= diff --git a/Documentation/driver-api/mmc/index.rst b/Documentation/driver-api/mmc/index.rst index 9aaf64951a8c..7339736ac774 100644 --- a/Documentation/driver-api/mmc/index.rst +++ b/Documentation/driver-api/mmc/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ======================== MMC/SD/SDIO card support ======================== diff --git a/Documentation/driver-api/mtd/index.rst b/Documentation/driver-api/mtd/index.rst index 2e0e7cc4055e..436ba5a851d7 100644 --- a/Documentation/driver-api/mtd/index.rst +++ b/Documentation/driver-api/mtd/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ============================== Memory Technology Device (MTD) ============================== diff --git a/Documentation/driver-api/nfc/index.rst b/Documentation/driver-api/nfc/index.rst index 3afb2c0c2e3c..b6e9eedbff29 100644 --- a/Documentation/driver-api/nfc/index.rst +++ b/Documentation/driver-api/nfc/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ======================== Near Field Communication ======================== diff --git a/Documentation/driver-api/nvdimm/index.rst b/Documentation/driver-api/nvdimm/index.rst index 19dc8ee371dc..a4f8f98aeb94 100644 --- a/Documentation/driver-api/nvdimm/index.rst +++ b/Documentation/driver-api/nvdimm/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =================================== Non-Volatile Memory Device (NVDIMM) =================================== diff --git a/Documentation/driver-api/phy/index.rst b/Documentation/driver-api/phy/index.rst index fce9ffae2812..69ba1216de72 100644 --- a/Documentation/driver-api/phy/index.rst +++ b/Documentation/driver-api/phy/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ===================== Generic PHY Framework ===================== diff --git a/Documentation/driver-api/rapidio/index.rst b/Documentation/driver-api/rapidio/index.rst index 4c5e51a05134..a41b4242d16f 100644 --- a/Documentation/driver-api/rapidio/index.rst +++ b/Documentation/driver-api/rapidio/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =========================== The Linux RapidIO Subsystem =========================== diff --git a/Documentation/ia64/index.rst b/Documentation/ia64/index.rst index ef99475f672b..0436e1034115 100644 --- a/Documentation/ia64/index.rst +++ b/Documentation/ia64/index.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================== IA-64 Architecture ================== -- cgit From 113094f743fc97559c068ad20fd2808b64f6989d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 08:36:50 -0300 Subject: docs: add some directories to the main documentation index The contents of those directories were orphaned at the documentation body. While those directories could likely be moved to be inside some guide, I'm opting to just adding their indexes to the main one, removing the :orphan: and adding the SPDX header. For the drivers, the rationale is that the documentation contains a mix of Kernelspace, uAPI and admin-guide. So, better to keep them on separate directories, as we've be doing with similar subsystem-specific docs that were not split yet. For the others, well... I'm too lazy to do the move. Also, it seems to make sense to keep at least some of those at the main dir (like kbuild, for example). In any case, a latter patch could do the move. Signed-off-by: Mauro Carvalho Chehab Acked-by: Bartlomiej Zolnierkiewicz --- Documentation/cdrom/index.rst | 2 +- Documentation/fault-injection/index.rst | 2 +- Documentation/fb/index.rst | 2 +- Documentation/fpga/index.rst | 2 +- Documentation/ide/index.rst | 2 +- Documentation/index.rst | 13 +++++++++++++ Documentation/kbuild/index.rst | 2 +- Documentation/livepatch/index.rst | 2 +- Documentation/netlabel/index.rst | 2 +- Documentation/pcmcia/index.rst | 2 +- Documentation/target/index.rst | 2 +- Documentation/timers/index.rst | 2 +- Documentation/watchdog/index.rst | 2 +- 13 files changed, 25 insertions(+), 12 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst index efbd5d111825..338ad5f94e7c 100644 --- a/Documentation/cdrom/index.rst +++ b/Documentation/cdrom/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ===== cdrom diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst index 92b5639ed07a..8408a8a91b34 100644 --- a/Documentation/fault-injection/index.rst +++ b/Documentation/fault-injection/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 =============== fault-injection diff --git a/Documentation/fb/index.rst b/Documentation/fb/index.rst index d47313714635..baf02393d8ee 100644 --- a/Documentation/fb/index.rst +++ b/Documentation/fb/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ============ Frame Buffer diff --git a/Documentation/fpga/index.rst b/Documentation/fpga/index.rst index 2c87d1ea084f..f80f95667ca2 100644 --- a/Documentation/fpga/index.rst +++ b/Documentation/fpga/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ==== fpga diff --git a/Documentation/ide/index.rst b/Documentation/ide/index.rst index 45bc12d3957f..813dfe611a31 100644 --- a/Documentation/ide/index.rst +++ b/Documentation/ide/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ================================== Integrated Drive Electronics (IDE) diff --git a/Documentation/index.rst b/Documentation/index.rst index dbfec00ba535..0cd4c3901456 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -35,6 +35,7 @@ trying to get it to work optimally on a given system. :maxdepth: 2 admin-guide/index + kbuild/index Firmware-related documentation ------------------------------ @@ -77,6 +78,9 @@ merged much easier. kernel-hacking/index trace/index maintainer/index + fault-injection/index + livepatch/index + Kernel API documentation ------------------------ @@ -94,11 +98,20 @@ needed). core-api/index accounting/index block/index + cdrom/index + ide/index + fb/index + fpga/index hid/index iio/index leds/index media/index + netlabel/index networking/index + pcmcia/index + target/index + timers/index + watchdog/index input/index hwmon/index gpu/index diff --git a/Documentation/kbuild/index.rst b/Documentation/kbuild/index.rst index 42d4cbe4460c..e323a3f2cc81 100644 --- a/Documentation/kbuild/index.rst +++ b/Documentation/kbuild/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 =================== Kernel Build System diff --git a/Documentation/livepatch/index.rst b/Documentation/livepatch/index.rst index edd291d51847..17674a9e21b2 100644 --- a/Documentation/livepatch/index.rst +++ b/Documentation/livepatch/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 =================== Kernel Livepatching diff --git a/Documentation/netlabel/index.rst b/Documentation/netlabel/index.rst index 47f1e0e5acd1..984e1b191b12 100644 --- a/Documentation/netlabel/index.rst +++ b/Documentation/netlabel/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ======== NetLabel diff --git a/Documentation/pcmcia/index.rst b/Documentation/pcmcia/index.rst index 779c8527109e..7ae1f62fca14 100644 --- a/Documentation/pcmcia/index.rst +++ b/Documentation/pcmcia/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ====== pcmcia diff --git a/Documentation/target/index.rst b/Documentation/target/index.rst index b68f48982392..4b24f81f747e 100644 --- a/Documentation/target/index.rst +++ b/Documentation/target/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ================== TCM Virtual Device diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst index 91f6f8263c48..df510ad0c989 100644 --- a/Documentation/timers/index.rst +++ b/Documentation/timers/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ====== timers diff --git a/Documentation/watchdog/index.rst b/Documentation/watchdog/index.rst index 33a0de631e84..c177645081d8 100644 --- a/Documentation/watchdog/index.rst +++ b/Documentation/watchdog/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ====================== Linux Watchdog Support -- cgit From 4c68060bf6d3eac6e86b995a200eb21b847236da Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 07:29:15 -0300 Subject: docs: locking: add it to the main index The locking directory is part of the Kernel API bookset. Add it to the index file. Signed-off-by: Mauro Carvalho Chehab --- Documentation/index.rst | 1 + Documentation/locking/index.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/index.rst b/Documentation/index.rst index 0cd4c3901456..eb5db850c4ef 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -96,6 +96,7 @@ needed). driver-api/index core-api/index + locking/index accounting/index block/index cdrom/index diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst index ef5da7fe9aac..626a463f7e42 100644 --- a/Documentation/locking/index.rst +++ b/Documentation/locking/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ======= locking -- cgit From c2746a1eb741759590e8766958232d06a71840d5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 08:14:42 -0300 Subject: docs: gpio: add sysfs interface to the admin-guide While this is stated as obsoleted, the sysfs interface described there is still valid, and belongs to the admin-guide. Signed-off-by: Mauro Carvalho Chehab Acked-by: Linus Walleij --- Documentation/ABI/obsolete/sysfs-gpio | 2 +- Documentation/admin-guide/gpio/index.rst | 17 +++ Documentation/admin-guide/gpio/sysfs.rst | 167 ++++++++++++++++++++++ Documentation/admin-guide/index.rst | 1 + Documentation/firmware-guide/acpi/enumeration.rst | 2 +- Documentation/gpio/index.rst | 17 --- Documentation/gpio/sysfs.rst | 167 ---------------------- Documentation/translations/zh_CN/gpio.txt | 4 +- 8 files changed, 189 insertions(+), 188 deletions(-) create mode 100644 Documentation/admin-guide/gpio/index.rst create mode 100644 Documentation/admin-guide/gpio/sysfs.rst delete mode 100644 Documentation/gpio/index.rst delete mode 100644 Documentation/gpio/sysfs.rst (limited to 'Documentation') diff --git a/Documentation/ABI/obsolete/sysfs-gpio b/Documentation/ABI/obsolete/sysfs-gpio index 40d41ea1a3f5..e0d4e5e2dd90 100644 --- a/Documentation/ABI/obsolete/sysfs-gpio +++ b/Documentation/ABI/obsolete/sysfs-gpio @@ -11,7 +11,7 @@ Description: Kernel code may export it for complete or partial access. GPIOs are identified as they are inside the kernel, using integers in - the range 0..INT_MAX. See Documentation/gpio for more information. + the range 0..INT_MAX. See Documentation/admin-guide/gpio for more information. /sys/class/gpio /export ... asks the kernel to export a GPIO to userspace diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst new file mode 100644 index 000000000000..a244ba4e87d5 --- /dev/null +++ b/Documentation/admin-guide/gpio/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +gpio +==== + +.. toctree:: + :maxdepth: 1 + + sysfs + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst new file mode 100644 index 000000000000..ec09ffd983e7 --- /dev/null +++ b/Documentation/admin-guide/gpio/sysfs.rst @@ -0,0 +1,167 @@ +GPIO Sysfs Interface for Userspace +================================== + +.. warning:: + + THIS ABI IS DEPRECATED, THE ABI DOCUMENTATION HAS BEEN MOVED TO + Documentation/ABI/obsolete/sysfs-gpio AND NEW USERSPACE CONSUMERS + ARE SUPPOSED TO USE THE CHARACTER DEVICE ABI. THIS OLD SYSFS ABI WILL + NOT BE DEVELOPED (NO NEW FEATURES), IT WILL JUST BE MAINTAINED. + +Refer to the examples in tools/gpio/* for an introduction to the new +character device ABI. Also see the userspace header in +include/uapi/linux/gpio.h + +The deprecated sysfs ABI +------------------------ +Platforms which use the "gpiolib" implementors framework may choose to +configure a sysfs user interface to GPIOs. This is different from the +debugfs interface, since it provides control over GPIO direction and +value instead of just showing a gpio state summary. Plus, it could be +present on production systems without debugging support. + +Given appropriate hardware documentation for the system, userspace could +know for example that GPIO #23 controls the write protect line used to +protect boot loader segments in flash memory. System upgrade procedures +may need to temporarily remove that protection, first importing a GPIO, +then changing its output state, then updating the code before re-enabling +the write protection. In normal use, GPIO #23 would never be touched, +and the kernel would have no need to know about it. + +Again depending on appropriate hardware documentation, on some systems +userspace GPIO can be used to determine system configuration data that +standard kernels won't know about. And for some tasks, simple userspace +GPIO drivers could be all that the system really needs. + +DO NOT ABUSE SYSFS TO CONTROL HARDWARE THAT HAS PROPER KERNEL DRIVERS. +PLEASE READ THE DOCUMENT AT Documentation/driver-api/gpio/drivers-on-gpio.rst +TO AVOID REINVENTING KERNEL WHEELS IN USERSPACE. I MEAN IT. REALLY. + +Paths in Sysfs +-------------- +There are three kinds of entries in /sys/class/gpio: + + - Control interfaces used to get userspace control over GPIOs; + + - GPIOs themselves; and + + - GPIO controllers ("gpio_chip" instances). + +That's in addition to standard files including the "device" symlink. + +The control interfaces are write-only: + + /sys/class/gpio/ + + "export" ... + Userspace may ask the kernel to export control of + a GPIO to userspace by writing its number to this file. + + Example: "echo 19 > export" will create a "gpio19" node + for GPIO #19, if that's not requested by kernel code. + + "unexport" ... + Reverses the effect of exporting to userspace. + + Example: "echo 19 > unexport" will remove a "gpio19" + node exported using the "export" file. + +GPIO signals have paths like /sys/class/gpio/gpio42/ (for GPIO #42) +and have the following read/write attributes: + + /sys/class/gpio/gpioN/ + + "direction" ... + reads as either "in" or "out". This value may + normally be written. Writing as "out" defaults to + initializing the value as low. To ensure glitch free + operation, values "low" and "high" may be written to + configure the GPIO as an output with that initial value. + + Note that this attribute *will not exist* if the kernel + doesn't support changing the direction of a GPIO, or + it was exported by kernel code that didn't explicitly + allow userspace to reconfigure this GPIO's direction. + + "value" ... + reads as either 0 (low) or 1 (high). If the GPIO + is configured as an output, this value may be written; + any nonzero value is treated as high. + + If the pin can be configured as interrupt-generating interrupt + and if it has been configured to generate interrupts (see the + description of "edge"), you can poll(2) on that file and + poll(2) will return whenever the interrupt was triggered. If + you use poll(2), set the events POLLPRI and POLLERR. If you + use select(2), set the file descriptor in exceptfds. After + poll(2) returns, either lseek(2) to the beginning of the sysfs + file and read the new value or close the file and re-open it + to read the value. + + "edge" ... + reads as either "none", "rising", "falling", or + "both". Write these strings to select the signal edge(s) + that will make poll(2) on the "value" file return. + + This file exists only if the pin can be configured as an + interrupt generating input pin. + + "active_low" ... + reads as either 0 (false) or 1 (true). Write + any nonzero value to invert the value attribute both + for reading and writing. Existing and subsequent + poll(2) support configuration via the edge attribute + for "rising" and "falling" edges will follow this + setting. + +GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the +controller implementing GPIOs starting at #42) and have the following +read-only attributes: + + /sys/class/gpio/gpiochipN/ + + "base" ... + same as N, the first GPIO managed by this chip + + "label" ... + provided for diagnostics (not always unique) + + "ngpio" ... + how many GPIOs this manages (N to N + ngpio - 1) + +Board documentation should in most cases cover what GPIOs are used for +what purposes. However, those numbers are not always stable; GPIOs on +a daughtercard might be different depending on the base board being used, +or other cards in the stack. In such cases, you may need to use the +gpiochip nodes (possibly in conjunction with schematics) to determine +the correct GPIO number to use for a given signal. + + +Exporting from Kernel code +-------------------------- +Kernel code can explicitly manage exports of GPIOs which have already been +requested using gpio_request():: + + /* export the GPIO to userspace */ + int gpiod_export(struct gpio_desc *desc, bool direction_may_change); + + /* reverse gpio_export() */ + void gpiod_unexport(struct gpio_desc *desc); + + /* create a sysfs link to an exported GPIO node */ + int gpiod_export_link(struct device *dev, const char *name, + struct gpio_desc *desc); + +After a kernel driver requests a GPIO, it may only be made available in +the sysfs interface by gpiod_export(). The driver can control whether the +signal direction may change. This helps drivers prevent userspace code +from accidentally clobbering important system state. + +This explicit exporting can help with debugging (by making some kinds +of experiments easier), or can provide an always-there interface that's +suitable for documenting as part of a board support package. + +After the GPIO has been exported, gpiod_export_link() allows creating +symlinks from elsewhere in sysfs to the GPIO sysfs node. Drivers can +use this to provide the interface under their own device in sysfs with +a descriptive name. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 4e98f5596da0..280355d08af5 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -91,6 +91,7 @@ configure specific aspects of kernel behavior to your liking. cputopology device-mapper/index efi-stub + gpio/index highuid hw_random iostats diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst index 1252617b520f..0a72b6321f5f 100644 --- a/Documentation/firmware-guide/acpi/enumeration.rst +++ b/Documentation/firmware-guide/acpi/enumeration.rst @@ -316,7 +316,7 @@ specifies the path to the controller. In order to use these GPIOs in Linux we need to translate them to the corresponding Linux GPIO descriptors. There is a standard GPIO API for that and is documented in -Documentation/gpio/. +Documentation/admin-guide/gpio/. In the above example we can get the corresponding two GPIO descriptors with a code like this:: diff --git a/Documentation/gpio/index.rst b/Documentation/gpio/index.rst deleted file mode 100644 index 09a4a553f434..000000000000 --- a/Documentation/gpio/index.rst +++ /dev/null @@ -1,17 +0,0 @@ -:orphan: - -==== -gpio -==== - -.. toctree:: - :maxdepth: 1 - - sysfs - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/gpio/sysfs.rst b/Documentation/gpio/sysfs.rst deleted file mode 100644 index ec09ffd983e7..000000000000 --- a/Documentation/gpio/sysfs.rst +++ /dev/null @@ -1,167 +0,0 @@ -GPIO Sysfs Interface for Userspace -================================== - -.. warning:: - - THIS ABI IS DEPRECATED, THE ABI DOCUMENTATION HAS BEEN MOVED TO - Documentation/ABI/obsolete/sysfs-gpio AND NEW USERSPACE CONSUMERS - ARE SUPPOSED TO USE THE CHARACTER DEVICE ABI. THIS OLD SYSFS ABI WILL - NOT BE DEVELOPED (NO NEW FEATURES), IT WILL JUST BE MAINTAINED. - -Refer to the examples in tools/gpio/* for an introduction to the new -character device ABI. Also see the userspace header in -include/uapi/linux/gpio.h - -The deprecated sysfs ABI ------------------------- -Platforms which use the "gpiolib" implementors framework may choose to -configure a sysfs user interface to GPIOs. This is different from the -debugfs interface, since it provides control over GPIO direction and -value instead of just showing a gpio state summary. Plus, it could be -present on production systems without debugging support. - -Given appropriate hardware documentation for the system, userspace could -know for example that GPIO #23 controls the write protect line used to -protect boot loader segments in flash memory. System upgrade procedures -may need to temporarily remove that protection, first importing a GPIO, -then changing its output state, then updating the code before re-enabling -the write protection. In normal use, GPIO #23 would never be touched, -and the kernel would have no need to know about it. - -Again depending on appropriate hardware documentation, on some systems -userspace GPIO can be used to determine system configuration data that -standard kernels won't know about. And for some tasks, simple userspace -GPIO drivers could be all that the system really needs. - -DO NOT ABUSE SYSFS TO CONTROL HARDWARE THAT HAS PROPER KERNEL DRIVERS. -PLEASE READ THE DOCUMENT AT Documentation/driver-api/gpio/drivers-on-gpio.rst -TO AVOID REINVENTING KERNEL WHEELS IN USERSPACE. I MEAN IT. REALLY. - -Paths in Sysfs --------------- -There are three kinds of entries in /sys/class/gpio: - - - Control interfaces used to get userspace control over GPIOs; - - - GPIOs themselves; and - - - GPIO controllers ("gpio_chip" instances). - -That's in addition to standard files including the "device" symlink. - -The control interfaces are write-only: - - /sys/class/gpio/ - - "export" ... - Userspace may ask the kernel to export control of - a GPIO to userspace by writing its number to this file. - - Example: "echo 19 > export" will create a "gpio19" node - for GPIO #19, if that's not requested by kernel code. - - "unexport" ... - Reverses the effect of exporting to userspace. - - Example: "echo 19 > unexport" will remove a "gpio19" - node exported using the "export" file. - -GPIO signals have paths like /sys/class/gpio/gpio42/ (for GPIO #42) -and have the following read/write attributes: - - /sys/class/gpio/gpioN/ - - "direction" ... - reads as either "in" or "out". This value may - normally be written. Writing as "out" defaults to - initializing the value as low. To ensure glitch free - operation, values "low" and "high" may be written to - configure the GPIO as an output with that initial value. - - Note that this attribute *will not exist* if the kernel - doesn't support changing the direction of a GPIO, or - it was exported by kernel code that didn't explicitly - allow userspace to reconfigure this GPIO's direction. - - "value" ... - reads as either 0 (low) or 1 (high). If the GPIO - is configured as an output, this value may be written; - any nonzero value is treated as high. - - If the pin can be configured as interrupt-generating interrupt - and if it has been configured to generate interrupts (see the - description of "edge"), you can poll(2) on that file and - poll(2) will return whenever the interrupt was triggered. If - you use poll(2), set the events POLLPRI and POLLERR. If you - use select(2), set the file descriptor in exceptfds. After - poll(2) returns, either lseek(2) to the beginning of the sysfs - file and read the new value or close the file and re-open it - to read the value. - - "edge" ... - reads as either "none", "rising", "falling", or - "both". Write these strings to select the signal edge(s) - that will make poll(2) on the "value" file return. - - This file exists only if the pin can be configured as an - interrupt generating input pin. - - "active_low" ... - reads as either 0 (false) or 1 (true). Write - any nonzero value to invert the value attribute both - for reading and writing. Existing and subsequent - poll(2) support configuration via the edge attribute - for "rising" and "falling" edges will follow this - setting. - -GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the -controller implementing GPIOs starting at #42) and have the following -read-only attributes: - - /sys/class/gpio/gpiochipN/ - - "base" ... - same as N, the first GPIO managed by this chip - - "label" ... - provided for diagnostics (not always unique) - - "ngpio" ... - how many GPIOs this manages (N to N + ngpio - 1) - -Board documentation should in most cases cover what GPIOs are used for -what purposes. However, those numbers are not always stable; GPIOs on -a daughtercard might be different depending on the base board being used, -or other cards in the stack. In such cases, you may need to use the -gpiochip nodes (possibly in conjunction with schematics) to determine -the correct GPIO number to use for a given signal. - - -Exporting from Kernel code --------------------------- -Kernel code can explicitly manage exports of GPIOs which have already been -requested using gpio_request():: - - /* export the GPIO to userspace */ - int gpiod_export(struct gpio_desc *desc, bool direction_may_change); - - /* reverse gpio_export() */ - void gpiod_unexport(struct gpio_desc *desc); - - /* create a sysfs link to an exported GPIO node */ - int gpiod_export_link(struct device *dev, const char *name, - struct gpio_desc *desc); - -After a kernel driver requests a GPIO, it may only be made available in -the sysfs interface by gpiod_export(). The driver can control whether the -signal direction may change. This helps drivers prevent userspace code -from accidentally clobbering important system state. - -This explicit exporting can help with debugging (by making some kinds -of experiments easier), or can provide an always-there interface that's -suitable for documenting as part of a board support package. - -After the GPIO has been exported, gpiod_export_link() allows creating -symlinks from elsewhere in sysfs to the GPIO sysfs node. Drivers can -use this to provide the interface under their own device in sysfs with -a descriptive name. diff --git a/Documentation/translations/zh_CN/gpio.txt b/Documentation/translations/zh_CN/gpio.txt index 4cb1ba8b8fed..a23ee14fc927 100644 --- a/Documentation/translations/zh_CN/gpio.txt +++ b/Documentation/translations/zh_CN/gpio.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/gpio +Chinese translated version of Documentation/admin-guide/gpio If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Grant Likely Linus Walleij Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/gpio 的中文翻译 +Documentation/admin-guide/gpio 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 -- cgit From eddeed127b06ea2542dc18f2fe37d383b6369fec Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 6 Jul 2019 13:38:56 -0300 Subject: docs: don't use nested tables Nested tables aren't supported for pdf output on Sphinx 1.7.9: admin-guide/laptops/sonypi:: nested tables are not yet implemented. admin-guide/laptops/toshiba_haps:: nested tables are not yet implemented. driver-api/nvdimm/btt:: nested tables are not yet implemented. s390/debugging390:: nested tables are not yet implemented. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andy Shevchenko # laptops --- Documentation/admin-guide/laptops/sonypi.rst | 28 ++++++++++------------ Documentation/admin-guide/laptops/toshiba_haps.rst | 8 +++---- Documentation/driver-api/nvdimm/btt.rst | 2 +- Documentation/s390/debugging390.rst | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/laptops/sonypi.rst b/Documentation/admin-guide/laptops/sonypi.rst index 2a1975ed7ee4..c6eaaf48f7c1 100644 --- a/Documentation/admin-guide/laptops/sonypi.rst +++ b/Documentation/admin-guide/laptops/sonypi.rst @@ -53,7 +53,7 @@ module or sonypi.= on the kernel boot line when sonypi is statically linked into the kernel). Those options are: =============== ======================================================= - minor: minor number of the misc device /dev/sonypi, + minor: minor number of the misc device /dev/sonypi, default is -1 (automatic allocation, see /proc/misc or kernel logs) @@ -89,24 +89,22 @@ statically linked into the kernel). Those options are: set to 0xffffffff, meaning that all possible events will be tried. You can use the following bits to construct your own event mask (from - drivers/char/sonypi.h): - - ======================== ====== - SONYPI_JOGGER_MASK 0x0001 - SONYPI_CAPTURE_MASK 0x0002 - SONYPI_FNKEY_MASK 0x0004 - SONYPI_BLUETOOTH_MASK 0x0008 - SONYPI_PKEY_MASK 0x0010 - SONYPI_BACK_MASK 0x0020 - SONYPI_HELP_MASK 0x0040 - SONYPI_LID_MASK 0x0080 - SONYPI_ZOOM_MASK 0x0100 - SONYPI_THUMBPHRASE_MASK 0x0200 + drivers/char/sonypi.h):: + + SONYPI_JOGGER_MASK 0x0001 + SONYPI_CAPTURE_MASK 0x0002 + SONYPI_FNKEY_MASK 0x0004 + SONYPI_BLUETOOTH_MASK 0x0008 + SONYPI_PKEY_MASK 0x0010 + SONYPI_BACK_MASK 0x0020 + SONYPI_HELP_MASK 0x0040 + SONYPI_LID_MASK 0x0080 + SONYPI_ZOOM_MASK 0x0100 + SONYPI_THUMBPHRASE_MASK 0x0200 SONYPI_MEYE_MASK 0x0400 SONYPI_MEMORYSTICK_MASK 0x0800 SONYPI_BATTERY_MASK 0x1000 SONYPI_WIRELESS_MASK 0x2000 - ======================== ====== useinput: if set (which is the default) two input devices are created, one which interprets the jogdial events as diff --git a/Documentation/admin-guide/laptops/toshiba_haps.rst b/Documentation/admin-guide/laptops/toshiba_haps.rst index 11dfc428c080..d28b6c3f2849 100644 --- a/Documentation/admin-guide/laptops/toshiba_haps.rst +++ b/Documentation/admin-guide/laptops/toshiba_haps.rst @@ -75,11 +75,11 @@ The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are: protection_level The protection_level is readable and writeable, and provides a way to let userspace query the current protection level, as well as set the desired protection level, the - available protection levels are: + available protection levels are:: - ============ ======= ========== ======== - 0 - Disabled 1 - Low 2 - Medium 3 - High - ============ ======= ========== ======== + ============ ======= ========== ======== + 0 - Disabled 1 - Low 2 - Medium 3 - High + ============ ======= ========== ======== reset_protection The reset_protection entry is writeable only, being "1" the only parameter it accepts, it is used to trigger diff --git a/Documentation/driver-api/nvdimm/btt.rst b/Documentation/driver-api/nvdimm/btt.rst index 2d8269f834bd..107395c042ae 100644 --- a/Documentation/driver-api/nvdimm/btt.rst +++ b/Documentation/driver-api/nvdimm/btt.rst @@ -83,7 +83,7 @@ flags, and the remaining form the internal block number. ======== ============================================================= Bit Description ======== ============================================================= -31 - 30 Error and Zero flags - Used in the following way: +31 - 30 Error and Zero flags - Used in the following way:: == == ==================================================== 31 30 Description diff --git a/Documentation/s390/debugging390.rst b/Documentation/s390/debugging390.rst index d49305fd5e1a..73ad0b06c666 100644 --- a/Documentation/s390/debugging390.rst +++ b/Documentation/s390/debugging390.rst @@ -170,7 +170,7 @@ currently running at. | +----------------+-------------------------------------------------+ | | 32 | Basic Addressing Mode | | | | | -| | | Used to set addressing mode | +| | | Used to set addressing mode:: | | | | | | | | +---------+----------+----------+ | | | | | PSW 31 | PSW 32 | | | -- cgit From 38cbfed28b3178dd9004064b1a72a992a3b31969 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 9 Jul 2019 12:22:41 -0300 Subject: docs: arm: fix a breakage with pdf output Add an extra blank line, as otherwise XeLaTex will complain with: ! LaTeX Error: Too deeply nested. Signed-off-by: Mauro Carvalho Chehab --- Documentation/arm/spear/overview.rst | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/arm/spear/overview.rst b/Documentation/arm/spear/overview.rst index 8a1a87aca427..1a77f6b213b6 100644 --- a/Documentation/arm/spear/overview.rst +++ b/Documentation/arm/spear/overview.rst @@ -15,6 +15,7 @@ Introduction Hierarchy in SPEAr is as follows: SPEAr (Platform) + - SPEAr3XX (3XX SOC series, based on ARM9) - SPEAr300 (SOC) - SPEAr300 Evaluation Board -- cgit From 8bb0776b8b27d548c7e65828ec3a02cb31fe3eed Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 9 Jul 2019 12:36:09 -0300 Subject: docs: block: fix pdf output Add an extra blank line and use a markup for the enumberated list, in order to make it possible to build the block book on pdf format. Signed-off-by: Mauro Carvalho Chehab --- Documentation/block/biodoc.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index d6e30b680405..c8eb28401332 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -9,6 +9,7 @@ Notes on the Generic Block Layer Rewrite in Linux 2.5 here might still be useful. Notes Written on Jan 15, 2002: + - Jens Axboe - Suparna Bhattacharya @@ -172,8 +173,8 @@ Some new queue property settings: New queue flags: - QUEUE_FLAG_CLUSTER (see 3.2.2) - QUEUE_FLAG_QUEUED (see 3.2.4) + - QUEUE_FLAG_CLUSTER (see 3.2.2) + - QUEUE_FLAG_QUEUED (see 3.2.4) ii. High-mem i/o capabilities are now considered the default @@ -478,7 +479,7 @@ With this multipage bio design: - Splitting of an i/o request across multiple devices (as in the case of lvm or raid) is achieved by cloning the bio (where the clone points to the same bi_io_vec array, but with the index and size accordingly modified) -- A linked list of bios is used as before for unrelated merges [*]_ - this +- A linked list of bios is used as before for unrelated merges [#]_ - this avoids reallocs and makes independent completions easier to handle. - Code that traverses the req list can find all the segments of a bio by using rq_for_each_segment. This handles the fact that a request @@ -489,7 +490,7 @@ With this multipage bio design: [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying bi_offset an len fields] -.. [*] +.. [#] unrelated merges -- a request ends up containing two or more bios that didn't originate from the same place. -- cgit From 168869492e7009b6861b615f1d030c99bc805e83 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 9 Jul 2019 13:25:51 -0300 Subject: docs: kbuild: fix build with pdf and fix some minor issues The tag ".. include" should be replaced by ".. literalinclude" at issues.rst, otherwise it causes TeX to crash due to excessive usage of stack with Sphinx 2.0. While here, solve a few minor issues at the kbuild book output by adding extra blank lines. Signed-off-by: Mauro Carvalho Chehab --- Documentation/kbuild/issues.rst | 20 ++++++++++++-------- Documentation/kbuild/kbuild.rst | 3 ++- Documentation/kbuild/kconfig-language.rst | 12 ++++++++++++ Documentation/kbuild/kconfig.rst | 8 ++++++-- Documentation/kbuild/makefiles.rst | 1 + 5 files changed, 33 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kbuild/issues.rst b/Documentation/kbuild/issues.rst index 9fdded4b681c..bdab01f733f6 100644 --- a/Documentation/kbuild/issues.rst +++ b/Documentation/kbuild/issues.rst @@ -1,11 +1,15 @@ -Recursion issue #1 ------------------- +================ +Recursion issues +================ - .. include:: Kconfig.recursion-issue-01 - :literal: +issue #1 +-------- -Recursion issue #2 ------------------- +.. literalinclude:: Kconfig.recursion-issue-01 + :language: kconfig - .. include:: Kconfig.recursion-issue-02 - :literal: +issue #2 +-------- + +.. literalinclude:: Kconfig.recursion-issue-02 + :language: kconfig diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index b25548963d70..ce9b99c004ae 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -18,7 +18,7 @@ This file lists all modules that are built into the kernel. This is used by modprobe to not fail when trying to load something builtin. modules.builtin.modinfo --------------------------------------------------- +----------------------- This file contains modinfo from all modules that are built into the kernel. Unlike modinfo of a separate module, all fields are prefixed with module name. @@ -153,6 +153,7 @@ Install script called when using "make install". The default name is "installkernel". The script will be called with the following arguments: + - $1 - kernel version - $2 - kernel image file - $3 - kernel map file diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index 2bc8a7803365..74bef19f69f0 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -53,6 +53,7 @@ A menu entry can have a number of attributes. Not all of them are applicable everywhere (see syntax). - type definition: "bool"/"tristate"/"string"/"hex"/"int" + Every config option must have a type. There are only two basic types: tristate and string; the other types are based on these two. The type definition optionally accepts an input prompt, so these two examples @@ -66,11 +67,13 @@ applicable everywhere (see syntax). prompt "Networking support" - input prompt: "prompt" ["if" ] + Every menu entry can have at most one prompt, which is used to display to the user. Optionally dependencies only for this prompt can be added with "if". - default value: "default" ["if" ] + A config option can have any number of default values. If multiple default values are visible, only the first defined one is active. Default values are not limited to the menu entry where they are @@ -112,6 +115,7 @@ applicable everywhere (see syntax). Optionally dependencies for this default value can be added with "if". - dependencies: "depends on" + This defines a dependency for this menu entry. If multiple dependencies are defined, they are connected with '&&'. Dependencies are applied to all other options within this menu entry (which also @@ -127,6 +131,7 @@ applicable everywhere (see syntax). default y - reverse dependencies: "select" ["if" ] + While normal dependencies reduce the upper limit of a symbol (see below), reverse dependencies can be used to force a lower limit of another symbol. The value of the current menu symbol is used as the @@ -146,6 +151,7 @@ applicable everywhere (see syntax). the illegal configurations all over. - weak reverse dependencies: "imply" ["if" ] + This is similar to "select" as it enforces a lower limit on another symbol except that the "implied" symbol's value may still be set to n from a direct dependency or with a visible prompt. @@ -176,6 +182,7 @@ applicable everywhere (see syntax). configure that subsystem out without also having to unset these drivers. - limiting menu display: "visible if" + This attribute is only applicable to menu blocks, if the condition is false, the menu block is not displayed to the user (the symbols contained there can still be selected by other symbols, though). It is @@ -183,12 +190,14 @@ applicable everywhere (see syntax). entries. Default value of "visible" is true. - numerical ranges: "range" ["if" ] + This allows to limit the range of possible input values for int and hex symbols. The user can only input a value which is larger than or equal to the first symbol and smaller than or equal to the second symbol. - help text: "help" or "---help---" + This defines a help text. The end of the help text is determined by the indentation level, this means it ends at the first line which has a smaller indentation than the first line of the help text. @@ -197,6 +206,7 @@ applicable everywhere (see syntax). the file as an aid to developers. - misc options: "option" [=] + Various less common options can be defined via this option syntax, which can modify the behaviour of the menu entry and its config symbol. These options are currently possible: @@ -325,6 +335,7 @@ end a menu entry: The first five also start the definition of a menu entry. config:: + "config" @@ -332,6 +343,7 @@ This defines a config symbol and accepts any of above attributes as options. menuconfig:: + "menuconfig" diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst index 88129af7e539..a9a855f894b3 100644 --- a/Documentation/kbuild/kconfig.rst +++ b/Documentation/kbuild/kconfig.rst @@ -264,6 +264,7 @@ NCONFIG_MODE This mode shows all sub-menus in one large tree. Example:: + make NCONFIG_MODE=single_menu nconfig ---------------------------------------------------------------------- @@ -277,9 +278,12 @@ Searching in xconfig: names, so you have to know something close to what you are looking for. - Example: + Example:: + Ctrl-F hotplug - or + + or:: + Menu: File, Search, hotplug lists all config symbol entries that contain "hotplug" in diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 093f2d79ab95..f31158457753 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -384,6 +384,7 @@ more details, with real examples. ----------------------- Kbuild tracks dependencies on the following: + 1) All prerequisite files (both `*.c` and `*.h`) 2) `CONFIG_` options used in all prerequisite files 3) Command-line used to compile target -- cgit From 89b408a68b9dd163b2705b6f73d8e3cc3579b457 Mon Sep 17 00:00:00 2001 From: Sheriff Esseson Date: Mon, 15 Jul 2019 09:15:09 -0700 Subject: Documentation: filesystem: Convert xfs.txt to ReST Move xfs.txt to admin-guide, convert xfs.txt to ReST and broken references Signed-off-by: Sheriff Esseson Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/xfs.rst | 466 +++++++++++++++++++++++++++++++++++ Documentation/filesystems/dax.txt | 2 +- Documentation/filesystems/xfs.txt | 470 ------------------------------------ 4 files changed, 468 insertions(+), 471 deletions(-) create mode 100644 Documentation/admin-guide/xfs.rst delete mode 100644 Documentation/filesystems/xfs.txt (limited to 'Documentation') diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 8001917ee012..3c18ba7912a7 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -70,6 +70,7 @@ configure specific aspects of kernel behavior to your liking. ras bcache ext4 + xfs pm/index thunderbolt LSM/index diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst new file mode 100644 index 000000000000..e76665a8f2f2 --- /dev/null +++ b/Documentation/admin-guide/xfs.rst @@ -0,0 +1,466 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +The SGI XFS Filesystem +====================== + +XFS is a high performance journaling filesystem which originated +on the SGI IRIX platform. It is completely multi-threaded, can +support large files and large filesystems, extended attributes, +variable block sizes, is extent based, and makes extensive use of +Btrees (directories, extents, free space) to aid both performance +and scalability. + +Refer to the documentation at https://xfs.wiki.kernel.org/ +for further details. This implementation is on-disk compatible +with the IRIX version of XFS. + + +Mount Options +============= + +When mounting an XFS filesystem, the following options are accepted. + + allocsize=size + Sets the buffered I/O end-of-file preallocation size when + doing delayed allocation writeout (default size is 64KiB). + Valid values for this option are page size (typically 4KiB) + through to 1GiB, inclusive, in power-of-2 increments. + + The default behaviour is for dynamic end-of-file + preallocation size, which uses a set of heuristics to + optimise the preallocation size based on the current + allocation patterns within the file and the access patterns + to the file. Specifying a fixed ``allocsize`` value turns off + the dynamic behaviour. + + attr2 or noattr2 + The options enable/disable an "opportunistic" improvement to + be made in the way inline extended attributes are stored + on-disk. When the new form is used for the first time when + ``attr2`` is selected (either when setting or removing extended + attributes) the on-disk superblock feature bit field will be + updated to reflect this format being in use. + + The default behaviour is determined by the on-disk feature + bit indicating that ``attr2`` behaviour is active. If either + mount option is set, then that becomes the new default used + by the filesystem. + + CRC enabled filesystems always use the ``attr2`` format, and so + will reject the ``noattr2`` mount option if it is set. + + discard or nodiscard (default) + Enable/disable the issuing of commands to let the block + device reclaim space freed by the filesystem. This is + useful for SSD devices, thinly provisioned LUNs and virtual + machine images, but may have a performance impact. + + Note: It is currently recommended that you use the ``fstrim`` + application to ``discard`` unused blocks rather than the ``discard`` + mount option because the performance impact of this option + is quite severe. + + grpid/bsdgroups or nogrpid/sysvgroups (default) + These options define what group ID a newly created file + gets. When ``grpid`` is set, it takes the group ID of the + directory in which it is created; otherwise it takes the + ``fsgid`` of the current process, unless the directory has the + ``setgid`` bit set, in which case it takes the ``gid`` from the + parent directory, and also gets the ``setgid`` bit set if it is + a directory itself. + + filestreams + Make the data allocator use the filestreams allocation mode + across the entire filesystem rather than just on directories + configured to use it. + + ikeep or noikeep (default) + When ``ikeep`` is specified, XFS does not delete empty inode + clusters and keeps them around on disk. When ``noikeep`` is + specified, empty inode clusters are returned to the free + space pool. + + inode32 or inode64 (default) + When ``inode32`` is specified, it indicates that XFS limits + inode creation to locations which will not result in inode + numbers with more than 32 bits of significance. + + When ``inode64`` is specified, it indicates that XFS is allowed + to create inodes at any location in the filesystem, + including those which will result in inode numbers occupying + more than 32 bits of significance. + + ``inode32`` is provided for backwards compatibility with older + systems and applications, since 64 bits inode numbers might + cause problems for some applications that cannot handle + large inode numbers. If applications are in use which do + not handle inode numbers bigger than 32 bits, the ``inode32`` + option should be specified. + + largeio or nolargeio (default) + If ``nolargeio`` is specified, the optimal I/O reported in + ``st_blksize`` by **stat(2)** will be as small as possible to allow + user applications to avoid inefficient read/modify/write + I/O. This is typically the page size of the machine, as + this is the granularity of the page cache. + + If ``largeio`` is specified, a filesystem that was created with a + ``swidth`` specified will return the ``swidth`` value (in bytes) + in ``st_blksize``. If the filesystem does not have a ``swidth`` + specified but does specify an ``allocsize`` then ``allocsize`` + (in bytes) will be returned instead. Otherwise the behaviour + is the same as if ``nolargeio`` was specified. + + logbufs=value + Set the number of in-memory log buffers. Valid numbers + range from 2-8 inclusive. + + The default value is 8 buffers. + + If the memory cost of 8 log buffers is too high on small + systems, then it may be reduced at some cost to performance + on metadata intensive workloads. The ``logbsize`` option below + controls the size of each buffer and so is also relevant to + this case. + + logbsize=value + Set the size of each in-memory log buffer. The size may be + specified in bytes, or in kilobytes with a "k" suffix. + Valid sizes for version 1 and version 2 logs are 16384 (16k) + and 32768 (32k). Valid sizes for version 2 logs also + include 65536 (64k), 131072 (128k) and 262144 (256k). The + logbsize must be an integer multiple of the log + stripe unit configured at **mkfs(8)** time. + + The default value for for version 1 logs is 32768, while the + default value for version 2 logs is MAX(32768, log_sunit). + + logdev=device and rtdev=device + Use an external log (metadata journal) and/or real-time device. + An XFS filesystem has up to three parts: a data section, a log + section, and a real-time section. The real-time section is + optional, and the log section can be separate from the data + section or contained within it. + + noalign + Data allocations will not be aligned at stripe unit + boundaries. This is only relevant to filesystems created + with non-zero data alignment parameters (``sunit``, ``swidth``) by + **mkfs(8)**. + + norecovery + The filesystem will be mounted without running log recovery. + If the filesystem was not cleanly unmounted, it is likely to + be inconsistent when mounted in ``norecovery`` mode. + Some files or directories may not be accessible because of this. + Filesystems mounted ``norecovery`` must be mounted read-only or + the mount will fail. + + nouuid + Don't check for double mounted file systems using the file + system ``uuid``. This is useful to mount LVM snapshot volumes, + and often used in combination with ``norecovery`` for mounting + read-only snapshots. + + noquota + Forcibly turns off all quota accounting and enforcement + within the filesystem. + + uquota/usrquota/uqnoenforce/quota + User disk quota accounting enabled, and limits (optionally) + enforced. Refer to **xfs_quota(8)** for further details. + + gquota/grpquota/gqnoenforce + Group disk quota accounting enabled and limits (optionally) + enforced. Refer to **xfs_quota(8)** for further details. + + pquota/prjquota/pqnoenforce + Project disk quota accounting enabled and limits (optionally) + enforced. Refer to **xfs_quota(8)** for further details. + + sunit=value and swidth=value + Used to specify the stripe unit and width for a RAID device + or a stripe volume. "value" must be specified in 512-byte + block units. These options are only relevant to filesystems + that were created with non-zero data alignment parameters. + + The ``sunit`` and ``swidth`` parameters specified must be compatible + with the existing filesystem alignment characteristics. In + general, that means the only valid changes to ``sunit`` are + increasing it by a power-of-2 multiple. Valid ``swidth`` values + are any integer multiple of a valid ``sunit`` value. + + Typically the only time these mount options are necessary if + after an underlying RAID device has had it's geometry + modified, such as adding a new disk to a RAID5 lun and + reshaping it. + + swalloc + Data allocations will be rounded up to stripe width boundaries + when the current end of file is being extended and the file + size is larger than the stripe width size. + + wsync + When specified, all filesystem namespace operations are + executed synchronously. This ensures that when the namespace + operation (create, unlink, etc) completes, the change to the + namespace is on stable storage. This is useful in HA setups + where failover must not result in clients seeing + inconsistent namespace presentation during or after a + failover event. + + +Deprecated Mount Options +======================== + +=========================== ================ + Name Removal Schedule +=========================== ================ +=========================== ================ + + +Removed Mount Options +===================== + +=========================== ======= + Name Removed +=========================== ======= + delaylog/nodelaylog v4.0 + ihashsize v4.0 + irixsgid v4.0 + osyncisdsync/osyncisosync v4.0 + barrier v4.19 + nobarrier v4.19 +=========================== ======= + +sysctls +======= + +The following sysctls are available for the XFS filesystem: + + fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1) + Setting this to "1" clears accumulated XFS statistics + in /proc/fs/xfs/stat. It then immediately resets to "0". + + fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000) + The interval at which the filesystem flushes metadata + out to disk and runs internal cache cleanup routines. + + fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000) + The interval at which the filesystem ages filestreams cache + references and returns timed-out AGs back to the free stream + pool. + + fs.xfs.speculative_prealloc_lifetime + (Units: seconds Min: 1 Default: 300 Max: 86400) + The interval at which the background scanning for inodes + with unused speculative preallocation runs. The scan + removes unused preallocation from clean inodes and releases + the unused space back to the free pool. + + fs.xfs.error_level (Min: 0 Default: 3 Max: 11) + A volume knob for error reporting when internal errors occur. + This will generate detailed messages & backtraces for filesystem + shutdowns, for example. Current threshold values are: + + XFS_ERRLEVEL_OFF: 0 + XFS_ERRLEVEL_LOW: 1 + XFS_ERRLEVEL_HIGH: 5 + + fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) + Causes certain error conditions to call BUG(). Value is a bitmask; + OR together the tags which represent errors which should cause panics: + + XFS_NO_PTAG 0 + XFS_PTAG_IFLUSH 0x00000001 + XFS_PTAG_LOGRES 0x00000002 + XFS_PTAG_AILDELETE 0x00000004 + XFS_PTAG_ERROR_REPORT 0x00000008 + XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 + XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 + XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 + XFS_PTAG_FSBLOCK_ZERO 0x00000080 + XFS_PTAG_VERIFIER_ERROR 0x00000100 + + This option is intended for debugging only. + + fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1) + Controls whether symlinks are created with mode 0777 (default) + or whether their mode is affected by the umask (irix mode). + + fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1) + Controls files created in SGID directories. + If the group ID of the new file does not match the effective group + ID or one of the supplementary group IDs of the parent dir, the + ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl + is set. + + fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) + Setting this to "1" will cause the "sync" flag set + by the **xfs_io(8)** chattr command on a directory to be + inherited by files in that directory. + + fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1) + Setting this to "1" will cause the "nodump" flag set + by the **xfs_io(8)** chattr command on a directory to be + inherited by files in that directory. + + fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1) + Setting this to "1" will cause the "noatime" flag set + by the **xfs_io(8)** chattr command on a directory to be + inherited by files in that directory. + + fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1) + Setting this to "1" will cause the "nosymlinks" flag set + by the **xfs_io(8)** chattr command on a directory to be + inherited by files in that directory. + + fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1) + Setting this to "1" will cause the "nodefrag" flag set + by the **xfs_io(8)** chattr command on a directory to be + inherited by files in that directory. + + fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256) + In "inode32" allocation mode, this option determines how many + files the allocator attempts to allocate in the same allocation + group before moving to the next allocation group. The intent + is to control the rate at which the allocator moves between + allocation groups when allocating extents for new files. + +Deprecated Sysctls +================== + +None at present. + + +Removed Sysctls +=============== + + Name Removed + ---- ------- + fs.xfs.xfsbufd_centisec v4.0 + fs.xfs.age_buffer_centisecs v4.0 + + +Error handling +============== + +XFS can act differently according to the type of error found during its +operation. The implementation introduces the following concepts to the error +handler: + + -failure speed: + Defines how fast XFS should propagate an error upwards when a specific + error is found during the filesystem operation. It can propagate + immediately, after a defined number of retries, after a set time period, + or simply retry forever. + + -error classes: + Specifies the subsystem the error configuration will apply to, such as + metadata IO or memory allocation. Different subsystems will have + different error handlers for which behaviour can be configured. + + -error handlers: + Defines the behavior for a specific error. + +The filesystem behavior during an error can be set via ``sysfs`` files. Each +error handler works independently - the first condition met by an error handler +for a specific class will cause the error to be propagated rather than reset and +retried. + +The action taken by the filesystem when the error is propagated is context +dependent - it may cause a shut down in the case of an unrecoverable error, +it may be reported back to userspace, or it may even be ignored because +there's nothing useful we can with the error or anyone we can report it to (e.g. +during unmount). + +The configuration files are organized into the following hierarchy for each +mounted filesystem: + + /sys/fs/xfs//error/// + +Where: + + The short device name of the mounted filesystem. This is the same device + name that shows up in XFS kernel error messages as "XFS(): ..." + + + The subsystem the error configuration belongs to. As of 4.9, the defined + classes are: + + - "metadata": applies metadata buffer write IO + + + The individual error handler configurations. + + +Each filesystem has "global" error configuration options defined in their top +level directory: + + /sys/fs/xfs//error/ + + fail_at_unmount (Min: 0 Default: 1 Max: 1) + Defines the filesystem error behavior at unmount time. + + If set to a value of 1, XFS will override all other error configurations + during unmount and replace them with "immediate fail" characteristics. + i.e. no retries, no retry timeout. This will always allow unmount to + succeed when there are persistent errors present. + + If set to 0, the configured retry behaviour will continue until all + retries and/or timeouts have been exhausted. This will delay unmount + completion when there are persistent errors, and it may prevent the + filesystem from ever unmounting fully in the case of "retry forever" + handler configurations. + + Note: there is no guarantee that fail_at_unmount can be set while an + unmount is in progress. It is possible that the ``sysfs`` entries are + removed by the unmounting filesystem before a "retry forever" error + handler configuration causes unmount to hang, and hence the filesystem + must be configured appropriately before unmount begins to prevent + unmount hangs. + +Each filesystem has specific error class handlers that define the error +propagation behaviour for specific errors. There is also a "default" error +handler defined, which defines the behaviour for all errors that don't have +specific handlers defined. Where multiple retry constraints are configured for +a single error, the first retry configuration that expires will cause the error +to be propagated. The handler configurations are found in the directory: + + /sys/fs/xfs//error/// + + max_retries (Min: -1 Default: Varies Max: INTMAX) + Defines the allowed number of retries of a specific error before + the filesystem will propagate the error. The retry count for a given + error context (e.g. a specific metadata buffer) is reset every time + there is a successful completion of the operation. + + Setting the value to "-1" will cause XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will make XFS retry the + operation "N" times before propagating the error. + + retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day) + Define the amount of time (in seconds) that the filesystem is + allowed to retry its operations when the specific error is + found. + + Setting the value to "-1" will allow XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the + operation for up to "N" seconds before propagating the error. + +**Note:** The default behaviour for a specific error handler is dependent on both +the class and error context. For example, the default values for +"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults +to "fail immediately" behaviour. This is done because ENODEV is a fatal, +unrecoverable error no matter how many times the metadata IO is retried. diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 6d2c0d340dea..679729442fd2 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -76,7 +76,7 @@ exposure of uninitialized data through mmap. These filesystems may be used for inspiration: - ext2: see Documentation/filesystems/ext2.txt - ext4: see Documentation/filesystems/ext4/ -- xfs: see Documentation/filesystems/xfs.txt +- xfs: see Documentation/admin-guide/xfs.rst Handling Media Errors diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt deleted file mode 100644 index a5cbb5e0e3db..000000000000 --- a/Documentation/filesystems/xfs.txt +++ /dev/null @@ -1,470 +0,0 @@ - -The SGI XFS Filesystem -====================== - -XFS is a high performance journaling filesystem which originated -on the SGI IRIX platform. It is completely multi-threaded, can -support large files and large filesystems, extended attributes, -variable block sizes, is extent based, and makes extensive use of -Btrees (directories, extents, free space) to aid both performance -and scalability. - -Refer to the documentation at https://xfs.wiki.kernel.org/ -for further details. This implementation is on-disk compatible -with the IRIX version of XFS. - - -Mount Options -============= - -When mounting an XFS filesystem, the following options are accepted. -For boolean mount options, the names with the (*) suffix is the -default behaviour. - - allocsize=size - Sets the buffered I/O end-of-file preallocation size when - doing delayed allocation writeout (default size is 64KiB). - Valid values for this option are page size (typically 4KiB) - through to 1GiB, inclusive, in power-of-2 increments. - - The default behaviour is for dynamic end-of-file - preallocation size, which uses a set of heuristics to - optimise the preallocation size based on the current - allocation patterns within the file and the access patterns - to the file. Specifying a fixed allocsize value turns off - the dynamic behaviour. - - attr2 - noattr2 - The options enable/disable an "opportunistic" improvement to - be made in the way inline extended attributes are stored - on-disk. When the new form is used for the first time when - attr2 is selected (either when setting or removing extended - attributes) the on-disk superblock feature bit field will be - updated to reflect this format being in use. - - The default behaviour is determined by the on-disk feature - bit indicating that attr2 behaviour is active. If either - mount option it set, then that becomes the new default used - by the filesystem. - - CRC enabled filesystems always use the attr2 format, and so - will reject the noattr2 mount option if it is set. - - discard - nodiscard (*) - Enable/disable the issuing of commands to let the block - device reclaim space freed by the filesystem. This is - useful for SSD devices, thinly provisioned LUNs and virtual - machine images, but may have a performance impact. - - Note: It is currently recommended that you use the fstrim - application to discard unused blocks rather than the discard - mount option because the performance impact of this option - is quite severe. - - grpid/bsdgroups - nogrpid/sysvgroups (*) - These options define what group ID a newly created file - gets. When grpid is set, it takes the group ID of the - directory in which it is created; otherwise it takes the - fsgid of the current process, unless the directory has the - setgid bit set, in which case it takes the gid from the - parent directory, and also gets the setgid bit set if it is - a directory itself. - - filestreams - Make the data allocator use the filestreams allocation mode - across the entire filesystem rather than just on directories - configured to use it. - - ikeep - noikeep (*) - When ikeep is specified, XFS does not delete empty inode - clusters and keeps them around on disk. When noikeep is - specified, empty inode clusters are returned to the free - space pool. - - inode32 - inode64 (*) - When inode32 is specified, it indicates that XFS limits - inode creation to locations which will not result in inode - numbers with more than 32 bits of significance. - - When inode64 is specified, it indicates that XFS is allowed - to create inodes at any location in the filesystem, - including those which will result in inode numbers occupying - more than 32 bits of significance. - - inode32 is provided for backwards compatibility with older - systems and applications, since 64 bits inode numbers might - cause problems for some applications that cannot handle - large inode numbers. If applications are in use which do - not handle inode numbers bigger than 32 bits, the inode32 - option should be specified. - - - largeio - nolargeio (*) - If "nolargeio" is specified, the optimal I/O reported in - st_blksize by stat(2) will be as small as possible to allow - user applications to avoid inefficient read/modify/write - I/O. This is typically the page size of the machine, as - this is the granularity of the page cache. - - If "largeio" specified, a filesystem that was created with a - "swidth" specified will return the "swidth" value (in bytes) - in st_blksize. If the filesystem does not have a "swidth" - specified but does specify an "allocsize" then "allocsize" - (in bytes) will be returned instead. Otherwise the behaviour - is the same as if "nolargeio" was specified. - - logbufs=value - Set the number of in-memory log buffers. Valid numbers - range from 2-8 inclusive. - - The default value is 8 buffers. - - If the memory cost of 8 log buffers is too high on small - systems, then it may be reduced at some cost to performance - on metadata intensive workloads. The logbsize option below - controls the size of each buffer and so is also relevant to - this case. - - logbsize=value - Set the size of each in-memory log buffer. The size may be - specified in bytes, or in kilobytes with a "k" suffix. - Valid sizes for version 1 and version 2 logs are 16384 (16k) - and 32768 (32k). Valid sizes for version 2 logs also - include 65536 (64k), 131072 (128k) and 262144 (256k). The - logbsize must be an integer multiple of the log - stripe unit configured at mkfs time. - - The default value for for version 1 logs is 32768, while the - default value for version 2 logs is MAX(32768, log_sunit). - - logdev=device and rtdev=device - Use an external log (metadata journal) and/or real-time device. - An XFS filesystem has up to three parts: a data section, a log - section, and a real-time section. The real-time section is - optional, and the log section can be separate from the data - section or contained within it. - - noalign - Data allocations will not be aligned at stripe unit - boundaries. This is only relevant to filesystems created - with non-zero data alignment parameters (sunit, swidth) by - mkfs. - - norecovery - The filesystem will be mounted without running log recovery. - If the filesystem was not cleanly unmounted, it is likely to - be inconsistent when mounted in "norecovery" mode. - Some files or directories may not be accessible because of this. - Filesystems mounted "norecovery" must be mounted read-only or - the mount will fail. - - nouuid - Don't check for double mounted file systems using the file - system uuid. This is useful to mount LVM snapshot volumes, - and often used in combination with "norecovery" for mounting - read-only snapshots. - - noquota - Forcibly turns off all quota accounting and enforcement - within the filesystem. - - uquota/usrquota/uqnoenforce/quota - User disk quota accounting enabled, and limits (optionally) - enforced. Refer to xfs_quota(8) for further details. - - gquota/grpquota/gqnoenforce - Group disk quota accounting enabled and limits (optionally) - enforced. Refer to xfs_quota(8) for further details. - - pquota/prjquota/pqnoenforce - Project disk quota accounting enabled and limits (optionally) - enforced. Refer to xfs_quota(8) for further details. - - sunit=value and swidth=value - Used to specify the stripe unit and width for a RAID device - or a stripe volume. "value" must be specified in 512-byte - block units. These options are only relevant to filesystems - that were created with non-zero data alignment parameters. - - The sunit and swidth parameters specified must be compatible - with the existing filesystem alignment characteristics. In - general, that means the only valid changes to sunit are - increasing it by a power-of-2 multiple. Valid swidth values - are any integer multiple of a valid sunit value. - - Typically the only time these mount options are necessary if - after an underlying RAID device has had it's geometry - modified, such as adding a new disk to a RAID5 lun and - reshaping it. - - swalloc - Data allocations will be rounded up to stripe width boundaries - when the current end of file is being extended and the file - size is larger than the stripe width size. - - wsync - When specified, all filesystem namespace operations are - executed synchronously. This ensures that when the namespace - operation (create, unlink, etc) completes, the change to the - namespace is on stable storage. This is useful in HA setups - where failover must not result in clients seeing - inconsistent namespace presentation during or after a - failover event. - - -Deprecated Mount Options -======================== - - Name Removal Schedule - ---- ---------------- - - -Removed Mount Options -===================== - - Name Removed - ---- ------- - delaylog/nodelaylog v4.0 - ihashsize v4.0 - irixsgid v4.0 - osyncisdsync/osyncisosync v4.0 - barrier v4.19 - nobarrier v4.19 - - -sysctls -======= - -The following sysctls are available for the XFS filesystem: - - fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1) - Setting this to "1" clears accumulated XFS statistics - in /proc/fs/xfs/stat. It then immediately resets to "0". - - fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000) - The interval at which the filesystem flushes metadata - out to disk and runs internal cache cleanup routines. - - fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000) - The interval at which the filesystem ages filestreams cache - references and returns timed-out AGs back to the free stream - pool. - - fs.xfs.speculative_prealloc_lifetime - (Units: seconds Min: 1 Default: 300 Max: 86400) - The interval at which the background scanning for inodes - with unused speculative preallocation runs. The scan - removes unused preallocation from clean inodes and releases - the unused space back to the free pool. - - fs.xfs.error_level (Min: 0 Default: 3 Max: 11) - A volume knob for error reporting when internal errors occur. - This will generate detailed messages & backtraces for filesystem - shutdowns, for example. Current threshold values are: - - XFS_ERRLEVEL_OFF: 0 - XFS_ERRLEVEL_LOW: 1 - XFS_ERRLEVEL_HIGH: 5 - - fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) - Causes certain error conditions to call BUG(). Value is a bitmask; - OR together the tags which represent errors which should cause panics: - - XFS_NO_PTAG 0 - XFS_PTAG_IFLUSH 0x00000001 - XFS_PTAG_LOGRES 0x00000002 - XFS_PTAG_AILDELETE 0x00000004 - XFS_PTAG_ERROR_REPORT 0x00000008 - XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 - XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 - XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 - XFS_PTAG_FSBLOCK_ZERO 0x00000080 - XFS_PTAG_VERIFIER_ERROR 0x00000100 - - This option is intended for debugging only. - - fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1) - Controls whether symlinks are created with mode 0777 (default) - or whether their mode is affected by the umask (irix mode). - - fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1) - Controls files created in SGID directories. - If the group ID of the new file does not match the effective group - ID or one of the supplementary group IDs of the parent dir, the - ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl - is set. - - fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) - Setting this to "1" will cause the "sync" flag set - by the xfs_io(8) chattr command on a directory to be - inherited by files in that directory. - - fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1) - Setting this to "1" will cause the "nodump" flag set - by the xfs_io(8) chattr command on a directory to be - inherited by files in that directory. - - fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1) - Setting this to "1" will cause the "noatime" flag set - by the xfs_io(8) chattr command on a directory to be - inherited by files in that directory. - - fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1) - Setting this to "1" will cause the "nosymlinks" flag set - by the xfs_io(8) chattr command on a directory to be - inherited by files in that directory. - - fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1) - Setting this to "1" will cause the "nodefrag" flag set - by the xfs_io(8) chattr command on a directory to be - inherited by files in that directory. - - fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256) - In "inode32" allocation mode, this option determines how many - files the allocator attempts to allocate in the same allocation - group before moving to the next allocation group. The intent - is to control the rate at which the allocator moves between - allocation groups when allocating extents for new files. - -Deprecated Sysctls -================== - -None at present. - - -Removed Sysctls -=============== - - Name Removed - ---- ------- - fs.xfs.xfsbufd_centisec v4.0 - fs.xfs.age_buffer_centisecs v4.0 - - -Error handling -============== - -XFS can act differently according to the type of error found during its -operation. The implementation introduces the following concepts to the error -handler: - - -failure speed: - Defines how fast XFS should propagate an error upwards when a specific - error is found during the filesystem operation. It can propagate - immediately, after a defined number of retries, after a set time period, - or simply retry forever. - - -error classes: - Specifies the subsystem the error configuration will apply to, such as - metadata IO or memory allocation. Different subsystems will have - different error handlers for which behaviour can be configured. - - -error handlers: - Defines the behavior for a specific error. - -The filesystem behavior during an error can be set via sysfs files. Each -error handler works independently - the first condition met by an error handler -for a specific class will cause the error to be propagated rather than reset and -retried. - -The action taken by the filesystem when the error is propagated is context -dependent - it may cause a shut down in the case of an unrecoverable error, -it may be reported back to userspace, or it may even be ignored because -there's nothing useful we can with the error or anyone we can report it to (e.g. -during unmount). - -The configuration files are organized into the following hierarchy for each -mounted filesystem: - - /sys/fs/xfs//error/// - -Where: - - The short device name of the mounted filesystem. This is the same device - name that shows up in XFS kernel error messages as "XFS(): ..." - - - The subsystem the error configuration belongs to. As of 4.9, the defined - classes are: - - - "metadata": applies metadata buffer write IO - - - The individual error handler configurations. - - -Each filesystem has "global" error configuration options defined in their top -level directory: - - /sys/fs/xfs//error/ - - fail_at_unmount (Min: 0 Default: 1 Max: 1) - Defines the filesystem error behavior at unmount time. - - If set to a value of 1, XFS will override all other error configurations - during unmount and replace them with "immediate fail" characteristics. - i.e. no retries, no retry timeout. This will always allow unmount to - succeed when there are persistent errors present. - - If set to 0, the configured retry behaviour will continue until all - retries and/or timeouts have been exhausted. This will delay unmount - completion when there are persistent errors, and it may prevent the - filesystem from ever unmounting fully in the case of "retry forever" - handler configurations. - - Note: there is no guarantee that fail_at_unmount can be set while an - unmount is in progress. It is possible that the sysfs entries are - removed by the unmounting filesystem before a "retry forever" error - handler configuration causes unmount to hang, and hence the filesystem - must be configured appropriately before unmount begins to prevent - unmount hangs. - -Each filesystem has specific error class handlers that define the error -propagation behaviour for specific errors. There is also a "default" error -handler defined, which defines the behaviour for all errors that don't have -specific handlers defined. Where multiple retry constraints are configuredi for -a single error, the first retry configuration that expires will cause the error -to be propagated. The handler configurations are found in the directory: - - /sys/fs/xfs//error/// - - max_retries (Min: -1 Default: Varies Max: INTMAX) - Defines the allowed number of retries of a specific error before - the filesystem will propagate the error. The retry count for a given - error context (e.g. a specific metadata buffer) is reset every time - there is a successful completion of the operation. - - Setting the value to "-1" will cause XFS to retry forever for this - specific error. - - Setting the value to "0" will cause XFS to fail immediately when the - specific error is reported. - - Setting the value to "N" (where 0 < N < Max) will make XFS retry the - operation "N" times before propagating the error. - - retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day) - Define the amount of time (in seconds) that the filesystem is - allowed to retry its operations when the specific error is - found. - - Setting the value to "-1" will allow XFS to retry forever for this - specific error. - - Setting the value to "0" will cause XFS to fail immediately when the - specific error is reported. - - Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the - operation for up to "N" seconds before propagating the error. - -Note: The default behaviour for a specific error handler is dependent on both -the class and error context. For example, the default values for -"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults -to "fail immediately" behaviour. This is done because ENODEV is a fatal, -unrecoverable error no matter how many times the metadata IO is retried. -- cgit From c6c405336bd3b0ebd1d76aaf9ea88b35dba77e61 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 16 Jul 2019 16:26:39 -0700 Subject: vmcore: add a kernel parameter novmcoredd Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in second kernel"), drivers are allowed to add device related dump data to vmcore as they want by using the device dump API. This has a potential issue, the data is stored in memory, drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a pre-reserved small chunk of memory. So as a result it will make kdump unusable at all due to OOM issues. So introduce new 'novmcoredd' command line option. User can disable device dump to reduce memory usage. This is helpful if device dump is using too much memory, disabling device dump could make sure a regular vmcore without device dump data is still available. [akpm@linux-foundation.org: tweak documentation] [akpm@linux-foundation.org: vmcore.c needs moduleparam.h] Link: http://lkml.kernel.org/r/20190528111856.7276-1-kasong@redhat.com Signed-off-by: Kairui Song Acked-by: Dave Young Reviewed-by: Bhupesh Sharma Cc: Rahul Lakkireddy Cc: "David S . Miller" Cc: Eric Biederman Cc: Alexey Dobriyan Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f8b62360b18c..bf8221abfe0a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2877,6 +2877,17 @@ /sys/module/printk/parameters/console_suspend) to turn on/off it dynamically. + novmcoredd [KNL,KDUMP] + Disable device dump. Device dump allows drivers to + append dump data to vmcore so you can collect driver + specified debug info. Drivers can append the data + without any limit and this data is stored in memory, + so this may cause significant memory stress. Disabling + device dump can help save memory but the driver debug + data will be no longer available. This parameter + is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP + is set. + noaliencache [MM, NUMA, SLAB] Disables the allocation of alien caches in the slab allocator. Saves per-node memory, but will impact performance. -- cgit From 65f50f255349959f15f2761abd17ead8530b2f33 Mon Sep 17 00:00:00 2001 From: Weitao Hou Date: Tue, 16 Jul 2019 16:26:54 -0700 Subject: kernel: fix typos and some coding style in comments fix lenght to length Link: http://lkml.kernel.org/r/20190521050937.4370-1-houweitaoo@gmail.com Signed-off-by: Weitao Hou Acked-by: Kees Cook Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/usb/s3c2410-usb.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt index e45b38ce2986..26c85afd0b53 100644 --- a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt +++ b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt @@ -4,7 +4,7 @@ OHCI Required properties: - compatible: should be "samsung,s3c2410-ohci" for USB host controller - - reg: address and lenght of the controller memory mapped region + - reg: address and length of the controller memory mapped region - interrupts: interrupt number for the USB OHCI controller - clocks: Should reference the bus and host clocks - clock-names: Should contain two strings -- cgit From 6ced9aa7b56baeb241a715df4539e60d5e3118e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jul 2019 16:28:32 -0700 Subject: coda: stop using 'struct timespec' in user API We exchange file timestamps with user space using psdev device read/write operations with a fixed but architecture specific binary layout. On 32-bit systems, this uses a 'timespec' structure that is defined by the C library to contain two 32-bit values for seconds and nanoseconds. As we get ready for the year 2038 overflow of the 32-bit signed seconds, the kernel now uses 64-bit timestamps internally, and user space will do the same change by changing the 'timespec' definition in the future. Unfortunately, this breaks the layout of the coda_vattr structure, so we need to redefine that in terms of something that does not change. I'm introducing a new 'struct vtimespec' structure here that keeps the existing layout, and the same change has to be done in the coda user space copy of linux/coda.h before anyone can use that on a 32-bit architecture with 64-bit time_t. An open question is what should happen to actual times past y2038, as they are now truncated to the last valid date when sent to user space, and interpreted as pre-1970 times when a timestamp with the MSB set is read back into the kernel. Alternatively, we could change the new timespec64_to_coda()/coda_to_timespec64() functions to use a different interpretation and extend the available range further to the future by disallowing past timestamps. This would require more changes in the user space side though. Link: http://lkml.kernel.org/r/562b7324149461743e4fbe2fedbf7c242f7e274a.1558117389.git.jaharkes@cs.cmu.edu Link: https://patchwork.kernel.org/patch/10474735/ Signed-off-by: Arnd Bergmann Signed-off-by: Jan Harkes Acked-by: Jan Harkes Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index 61311356025d..ea5969068895 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,7 +481,10 @@ kernel support. - + struct vtimespec { + long tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ + }; struct coda_vattr { enum coda_vtype va_type; /* vnode type (for create) */ @@ -493,9 +496,9 @@ kernel support. long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ + struct vtimespec va_atime; /* time of last access */ + struct vtimespec va_mtime; /* time of last modification */ + struct vtimespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ -- cgit From 5e7c31dfe74703f428220384b2863525957cc160 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:35 -0700 Subject: coda: change Coda's user api to use 64-bit time_t in timespec Move the 32-bit time_t problems to userspace. Link: http://lkml.kernel.org/r/8d089068823bfb292a4020f773922fbd82ffad39.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index ea5969068895..545262c167c3 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,8 +481,8 @@ kernel support. - struct vtimespec { - long tv_sec; /* seconds */ + struct coda_timespec { + int64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; @@ -496,9 +496,9 @@ kernel support. long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct vtimespec va_atime; /* time of last access */ - struct vtimespec va_mtime; /* time of last modification */ - struct vtimespec va_ctime; /* time file changed */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ -- cgit From 814bbf49dcd0ad642e7ceb8991e57555c5472cce Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sun, 14 Jul 2019 14:04:14 +0200 Subject: xen: remove tmem driver The Xen tmem (transcendent memory) driver can be removed, as the related Xen hypervisor feature never made it past the "experimental" state and will be removed in future Xen versions (>= 4.13). The xen-selfballoon driver depends on tmem, so it can be removed, too. Signed-off-by: Juergen Gross Acked-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f8b62360b18c..99db4975e6b5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4674,27 +4674,6 @@ Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. - tmem [KNL,XEN] - Enable the Transcendent memory driver if built-in. - - tmem.cleancache=0|1 [KNL, XEN] - Default is on (1). Disable the usage of the cleancache - API to send anonymous pages to the hypervisor. - - tmem.frontswap=0|1 [KNL, XEN] - Default is on (1). Disable the usage of the frontswap - API to send swap pages to the hypervisor. If disabled - the selfballooning and selfshrinking are force disabled. - - tmem.selfballooning=0|1 [KNL, XEN] - Default is on (1). Disable the driving of swap pages - to the hypervisor. - - tmem.selfshrinking=0|1 [KNL, XEN] - Default is on (1). Partial swapoff that immediately - transfers pages from Xen hypervisor back to the - kernel based on different criteria. - topology= [S390] Format: {off | on} Specify if the kernel should make use of the cpu -- cgit From 30978346372e5c43a652cfbd4533c6bd5427c33b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jul 2019 20:02:09 +0800 Subject: x86: Add "nopv" parameter to disable PV extensions In virtualization environment, PV extensions (drivers, interrupts, timers, etc) are enabled in the majority of use cases which is the best option. However, in some cases (kexec not fully working, benchmarking) we want to disable PV extensions. We have "xen_nopv" for that purpose but only for XEN. For a consistent admin experience a common command line parameter "nopv" set across all PV guest implementations is a better choice. There are guest types which just won't work without PV extensions, like Xen PV, Xen PVH and jailhouse. add a "ignore_nopv" member to struct hypervisor_x86 set to true for those guest types and call the detect functions only if nopv is false or ignore_nopv is true. Suggested-by: Juergen Gross Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Jan Kiszka Cc: Boris Ostrovsky Cc: Stefano Stabellini Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 99db4975e6b5..936e8e7e6474 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5257,6 +5257,11 @@ improve timer resolution at the expense of processing more timer interrupts. + nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + Disables the PV optimizations forcing the guest to run + as generic guest with no PV drivers. Currently support + XEN HVM, KVM, HYPER_V and VMWARE guest. + xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] -- cgit From b39b049749ce08c7756be57082177730617bb9a0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jul 2019 20:02:10 +0800 Subject: xen: Map "xen_nopv" parameter to "nopv" and mark it obsolete Clean up unnecessory code after that operation. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 936e8e7e6474..2d990585402c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5243,6 +5243,8 @@ xen_nopv [X86] Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers. + This option is obsoleted by the "nopv" option, which + has equivalent effect for XEN platform. xen_scrub_pages= [XEN] Boolean option to control scrubbing pages before giving them back -- cgit From 5ef872636ca71d35ddc5ee5951c9272a51c74418 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jul 2019 11:45:58 +0900 Subject: kbuild: get rid of misleading $(AS) from documents The assembler files in the kernel are *.S instead of *.s, so they must be preprocessed. Since 'as' of GNU binutils is not able to preprocess, we always use $(CC) as an assembler driver. $(AS) is almost unused in Kbuild. As of v5.2, there is just one place that directly invokes $(AS). $ git grep -e '$(AS)' -e '${AS}' -e '$AS' -e '$(AS:' -e '${AS:' -- :^Documentation drivers/net/wan/Makefile: AS68K = $(AS) The documentation about *_AFLAGS* sounds like the flags were passed to $(AS). This is somewhat misleading. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- Documentation/kbuild/kbuild.rst | 5 ++--- Documentation/kbuild/makefiles.rst | 12 ++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index b25548963d70..727520b3d7b1 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -38,12 +38,11 @@ Additional options to the assembler (for built-in and modules). AFLAGS_MODULE ------------- -Additional module specific options to use for $(AS). +Additional assembler options for modules. AFLAGS_KERNEL ------------- -Additional options for $(AS) when used for assembler -code for code that is compiled as built-in. +Additional assembler options for built-in. KCFLAGS ------- diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 093f2d79ab95..67e47589d9d2 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -328,7 +328,7 @@ more details, with real examples. variable $(KBUILD_CFLAGS) and uses it for compilation flags for the entire tree. - asflags-y specifies options for assembling with $(AS). + asflags-y specifies assembler options. Example:: @@ -489,7 +489,7 @@ more details, with real examples. as-instr checks if the assembler reports a specific instruction and then outputs either option1 or option2 C escapes are supported in the test instruction - Note: as-instr-option uses KBUILD_AFLAGS for $(AS) options + Note: as-instr-option uses KBUILD_AFLAGS for assembler options cc-option cc-option is used to check if $(CC) supports a given option, and if @@ -905,7 +905,7 @@ When kbuild executes, the following steps are followed (roughly): vmlinux. The usage of $(call if_changed,xxx) will be described later. KBUILD_AFLAGS - $(AS) assembler flags + Assembler flags Default value - see top level Makefile Append or modify as required per architecture. @@ -948,16 +948,16 @@ When kbuild executes, the following steps are followed (roughly): to 'y' when selected. KBUILD_AFLAGS_KERNEL - $(AS) options specific for built-in + Assembler options specific for built-in $(KBUILD_AFLAGS_KERNEL) contains extra C compiler flags used to compile resident kernel code. KBUILD_AFLAGS_MODULE - Options for $(AS) when building modules + Assembler options specific for modules $(KBUILD_AFLAGS_MODULE) is used to add arch-specific options that - are used for $(AS). + are used for assembler. From commandline AFLAGS_MODULE shall be used (see kbuild.txt). -- cgit From 9af93db9e140a4e6e79cdb098919bc928a72cd59 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 17 Jul 2019 13:10:58 +0800 Subject: platform/x86: asus: Rename "fan mode" to "fan boost mode" The Asus WMI spec indicates that the function being controlled here is called "Fan Boost Mode". The user-facing documentation also calls it this. The spec uses the term "fan mode" is used to refer to other things, including functionality expected to appear on future products. We missed this before as we are not dealing with the most readable of specs, and didn't forsee any confusion around shortening the name. Rename "fan mode" to "fan boost mode" to improve consistency with the spec and to avoid a future naming conflict. There is no interface breakage here since this has yet to be included in an official kernel release. I also updated the kernel version listed under ABI accordingly. Signed-off-by: Daniel Drake Acked-by: Yurii Pavlovskyi Signed-off-by: Andy Shevchenko --- Documentation/ABI/testing/sysfs-platform-asus-wmi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi index 87ae5cc983bf..9e99f2909612 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-wmi +++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi @@ -37,9 +37,9 @@ Contact: "AceLan Kao" Description: Resume on lid open. 1 means on, 0 means off. -What: /sys/devices/platform//fan_mode -Date: Apr 2019 -KernelVersion: 5.2 +What: /sys/devices/platform//fan_boost_mode +Date: Sep 2019 +KernelVersion: 5.3 Contact: "Yurii Pavlovskyi" Description: Fan boost mode: -- cgit From b7dca6dd1e591ad19a9aae716f3898be8063f880 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 17 Jul 2019 15:17:57 +0900 Subject: kbuild: create *.mod with full directory path and remove MODVERDIR While descending directories, Kbuild produces objects for modules, but do not link final *.ko files; it is done in the modpost. To keep track of modules, Kbuild creates a *.mod file in $(MODVERDIR) for every module it is building. Some post-processing steps read the necessary information from *.mod files. This avoids descending into directories again. This mechanism was introduced in 2003 or so. Later, commit 551559e13af1 ("kbuild: implement modules.order") added modules.order. So, we can simply read it out to know all the modules with directory paths. This is easier than parsing the first line of *.mod files. $(MODVERDIR) has a flat directory structure, that is, *.mod files are named only with base names. This is based on the assumption that the module name is unique across the tree. This assumption is really fragile. Stephen Rothwell reported a race condition caused by a module name conflict: https://lkml.org/lkml/2019/5/13/991 In parallel building, two different threads could write to the same $(MODVERDIR)/*.mod simultaneously. Non-unique module names are the source of all kind of troubles, hence commit 3a48a91901c5 ("kbuild: check uniqueness of module names") introduced a new checker script. However, it is still fragile in the build system point of view because this race happens before scripts/modules-check.sh is invoked. If it happens again, the modpost will emit unclear error messages. To fix this issue completely, create *.mod with full directory path so that two threads never attempt to write to the same file. $(MODVERDIR) is no longer needed. Since modules with directory paths are listed in modules.order, Kbuild is still able to find *.mod files without additional descending. I also killed cmd_secanalysis; scripts/mod/sumversion.c computes MD4 hash for modules with MODULE_VERSION(). When CONFIG_DEBUG_SECTION_MISMATCH=y, it occurs not only in the modpost stage, but also during directory descending, where sumversion.c may parse stale *.mod files. It would emit 'No such file or directory' warning when an object consisting a module is renamed, or when a single-obj module is turned into a multi-obj module or vice versa. Signed-off-by: Masahiro Yamada Acked-by: Nicolas Pitre --- Documentation/dontdiff | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 5eba889ea84d..9f4392876099 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -30,6 +30,7 @@ *.lzo *.mo *.moc +*.mod *.mod.c *.o *.o.* -- cgit From c06306696f8368b08774e2a743dbc52d92a61693 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 18 Jul 2019 15:57:27 -0700 Subject: mm: thp: fix false negative of shmem vma's THP eligibility Commit 7635d9cbe832 ("mm, thp, proc: report THP eligibility for each vma") introduced THPeligible bit for processes' smaps. But, when checking the eligibility for shmem vma, __transparent_hugepage_enabled() is called to override the result from shmem_huge_enabled(). It may result in the anonymous vma's THP flag override shmem's. For example, running a simple test which create THP for shmem, but with anonymous THP disabled, when reading the process's smaps, it may show: 7fc92ec00000-7fc92f000000 rw-s 00000000 00:14 27764 /dev/shm/test Size: 4096 kB ... [snip] ... ShmemPmdMapped: 4096 kB ... [snip] ... THPeligible: 0 And, /proc/meminfo does show THP allocated and PMD mapped too: ShmemHugePages: 4096 kB ShmemPmdMapped: 4096 kB This doesn't make too much sense. The shmem objects should be treated separately from anonymous THP. Calling shmem_huge_enabled() with checking MMF_DISABLE_THP sounds good enough. And, we could skip stack and dax vma check since we already checked if the vma is shmem already. Also check if vma is suitable for THP by calling transhuge_vma_suitable(). And minor fix to smaps output format and documentation. Link: http://lkml.kernel.org/r/1560401041-32207-3-git-send-email-yang.shi@linux.alibaba.com Fixes: 7635d9cbe832 ("mm, thp, proc: report THP eligibility for each vma") Signed-off-by: Yang Shi Acked-by: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fb4735fd73b0..99ca040e3f90 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -486,8 +486,8 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. -"THPeligible" indicates whether the mapping is eligible for THP pages - 1 if -true, 0 otherwise. +"THPeligible" indicates whether the mapping is eligible for allocating THP +pages - 1 if true, 0 otherwise. It just shows the current status. "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded -- cgit From a0653406a3a671c1609d54835f0443869525ca30 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:29 -0700 Subject: mm: document ZONE_DEVICE memory-model implications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explain the general mechanisms of 'ZONE_DEVICE' pages and list the users of 'devm_memremap_pages()'. [dan.j.williams@intel.com: update ZONE_DEVICE memory model documentation] Link: http://lkml.kernel.org/r/156109575458.1409767.1885676287099277666.stgit@dwillia2-desk3.amr.corp.intel.com Link: http://lkml.kernel.org/r/156092354985.979959.15763234410543451710.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Mike Rapoport Reviewed-by: Mike Rapoport Tested-by: Aneesh Kumar K.V [ppc64] Cc: Jonathan Corbet Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/memory-model.rst | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index 382f72ace1fc..58a12376b7df 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -181,3 +181,43 @@ that is eventually passed to vmemmap_populate() through a long chain of function calls. The vmemmap_populate() implementation may use the `vmem_altmap` along with :c:func:`altmap_alloc_block_buf` helper to allocate memory map on the persistent memory device. + +ZONE_DEVICE +=========== +The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer +`struct page` `mem_map` services for device driver identified physical +address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact +that the page objects for these address ranges are never marked online, +and that a reference must be taken against the device, not just the page +to keep the memory pinned for active use. `ZONE_DEVICE`, via +:c:func:`devm_memremap_pages`, performs just enough memory hotplug to +turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and +:c:func:`get_user_pages` service for the given range of pfns. Since the +page reference count never drops below 1 the page is never tracked as +free memory and the page's `struct list_head lru` space is repurposed +for back referencing to the host device / driver that mapped the memory. + +While `SPARSEMEM` presents memory as a collection of sections, +optionally collected into memory blocks, `ZONE_DEVICE` users have a need +for smaller granularity of populating the `mem_map`. Given that +`ZONE_DEVICE` memory is never marked online it is subsequently never +subject to its memory ranges being exposed through the sysfs memory +hotplug api on memory block boundaries. The implementation relies on +this lack of user-api constraint to allow sub-section sized memory +ranges to be specified to :c:func:`arch_add_memory`, the top-half of +memory hotplug. Sub-section support allows for 2MB as the cross-arch +common alignment granularity for :c:func:`devm_memremap_pages`. + +The users of `ZONE_DEVICE` are: + +* pmem: Map platform persistent memory to be used as a direct-I/O target + via DAX mappings. + +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` + event callbacks to allow a device-driver to coordinate memory management + events related to device-memory, typically GPU memory. See + Documentation/vm/hmm.rst. + +* p2pdma: Create `struct page` objects to allow peer devices in a + PCI/-E topology to coordinate direct-DMA operations between themselves, + i.e. bypass host memory. -- cgit From 69d812f5eb249bf30069f81dd508aa4548ec6eb1 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 6 Jun 2019 15:37:32 +0800 Subject: dt-bindings: interrupt-controller: Update csky mpintc Add trigger type setting for csky,mpintc. The driver also could support #interrupt-cells <1> and it wouldn't invalidate existing DTs. Here we only show the complete format. Signed-off-by: Guo Ren Reviewed-by: Rob Herring Cc: Marc Zyngier --- .../bindings/interrupt-controller/csky,mpintc.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt index ab921f1698fb..e13405355166 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt @@ -6,11 +6,16 @@ C-SKY Multi-processors Interrupt Controller is designed for ck807/ck810/ck860 SMP soc, and it also could be used in non-SMP system. Interrupt number definition: - 0-15 : software irq, and we use 15 as our IPI_IRQ. 16-31 : private irq, and we use 16 as the co-processor timer. 31-1024: common irq for soc ip. +Interrupt triger mode: (Defined in dt-bindings/interrupt-controller/irq.h) + IRQ_TYPE_LEVEL_HIGH (default) + IRQ_TYPE_LEVEL_LOW + IRQ_TYPE_EDGE_RISING + IRQ_TYPE_EDGE_FALLING + ============================= intc node bindings definition ============================= @@ -26,15 +31,22 @@ intc node bindings definition - #interrupt-cells Usage: required Value type: - Definition: must be <1> + Definition: <2> - interrupt-controller: Usage: required -Examples: +Examples: ("interrupts = ") --------- +#include intc: interrupt-controller { compatible = "csky,mpintc"; - #interrupt-cells = <1>; + #interrupt-cells = <2>; interrupt-controller; }; + + device: device-example { + ... + interrupts = <34 IRQ_TYPE_EDGE_RISING>; + interrupt-parent = <&intc>; + }; -- cgit From 4d581034f9086f784a3408575bdb3c201740c6cb Mon Sep 17 00:00:00 2001 From: Mao Han Date: Tue, 4 Jun 2019 18:54:47 +0800 Subject: dt-bindings: csky: Add csky PMU bindings This patch adds the documentation to describe that how to add pmu node in dts. Signed-off-by: Mao Han Signed-off-by: Guo Ren Cc: Rob Herring --- Documentation/devicetree/bindings/csky/pmu.txt | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Documentation/devicetree/bindings/csky/pmu.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/csky/pmu.txt b/Documentation/devicetree/bindings/csky/pmu.txt new file mode 100644 index 000000000000..728d05ca6a1c --- /dev/null +++ b/Documentation/devicetree/bindings/csky/pmu.txt @@ -0,0 +1,38 @@ +=============================== +C-SKY Performance Monitor Units +=============================== + +C-SKY Performance Monitor is designed for ck807/ck810/ck860 SMP soc and +it could count cpu's events for helping analysis performance issues. + +============================ +PMU node bindings definition +============================ + + Description: Describes PMU + + PROPERTIES + + - compatible + Usage: required + Value type: + Definition: must be "csky,csky-pmu" + - interrupts + Usage: required + Value type: + Definition: must be pmu irq num defined by soc + - count-width + Usage: optional + Value type: + Definition: the width of pmu counter + +Examples: +--------- +#include + + pmu: performace-monitor { + compatible = "csky,csky-pmu"; + interrupts = <23 IRQ_TYPE_EDGE_RISING>; + interrupt-parent = <&intc>; + count-width = <48>; + }; -- cgit From 40ef768ab6eecc1b51461a034274350b31fc29d1 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 18 Jul 2019 23:51:56 -0400 Subject: Remove references to dead website. This fell into disrepair a while ago, and the majority of hits to the snapshots were from bots, so it's more trouble to keep running than it's worth. Signed-off-by: Dave Jones Signed-off-by: Linus Torvalds --- Documentation/dev-tools/sparse.rst | 5 ----- Documentation/translations/zh_CN/sparse.txt | 4 ---- 2 files changed, 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst index c401c952a340..6f4870528226 100644 --- a/Documentation/dev-tools/sparse.rst +++ b/Documentation/dev-tools/sparse.rst @@ -81,11 +81,6 @@ of sparse using git to clone:: git://git.kernel.org/pub/scm/devel/sparse/sparse.git -DaveJ has hourly generated tarballs of the git tree available at:: - - http://www.codemonkey.org.uk/projects/git-snapshots/sparse/ - - Once you have it, just do:: make diff --git a/Documentation/translations/zh_CN/sparse.txt b/Documentation/translations/zh_CN/sparse.txt index 47fc4a06ebe8..0f444b83d639 100644 --- a/Documentation/translations/zh_CN/sparse.txt +++ b/Documentation/translations/zh_CN/sparse.txt @@ -73,10 +73,6 @@ __bitwise"类型。 git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git -DaveJ 把每小时自动生成的 git 源码树 tar 包放在以下地址: - - http://www.codemonkey.org.uk/projects/git-snapshots/sparse/ - 一旦你下载了源码,只要以普通用户身份运行: make -- cgit From 30cd8604323dbaf20a80e797fe7057f5b02e394d Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Thu, 18 Jul 2019 11:38:18 -0700 Subject: KVM: x86: Add fixed counters to PMU filter Updates KVM_CAP_PMU_EVENT_FILTER so it can also whitelist or blacklist fixed counters. Signed-off-by: Eric Hankland [No need to check padding fields for zero. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2cd6250b2896..e54a3f51ddc5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4090,17 +4090,22 @@ Parameters: struct kvm_pmu_event_filter (in) Returns: 0 on success, -1 on error struct kvm_pmu_event_filter { - __u32 action; - __u32 nevents; - __u64 events[0]; + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[0]; }; This ioctl restricts the set of PMU events that the guest can program. The argument holds a list of events which will be allowed or denied. The eventsel+umask of each event the guest attempts to program is compared against the events field to determine whether the guest should have access. -This only affects general purpose counters; fixed purpose counters can -be disabled by changing the perfmon CPUID leaf. +The events field only controls general purpose counters; fixed purpose +counters are controlled by the fixed_counter_bitmap. + +No flags are defined yet, the field must be zero. Valid values for 'action': #define KVM_PMU_EVENT_ALLOW 0 -- cgit From 15ffef1ae69e99ebb54326f0220916b1fe619b24 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 3 Jul 2019 14:17:06 -0600 Subject: dt-bindings: Ensure child nodes are of type 'object' Properties which are child node definitions need to have an explict type. Otherwise, a matching (DT) property can silently match when an error is desired. Fix this up tree-wide. Once this is fixed, the meta-schema will enforce this on any child node definitions. Cc: Chen-Yu Tsai Cc: David Woodhouse Cc: Brian Norris Cc: Marek Vasut Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: Linus Walleij Cc: Maxime Coquelin Cc: linux-mtd@lists.infradead.org Cc: linux-gpio@vger.kernel.org Cc: linux-stm32@st-md-mailman.stormreply.com Cc: linux-spi@vger.kernel.org Acked-by: Miquel Raynal Acked-by: Maxime Ripard Acked-by: Mark Brown Acked-by: Alexandre TORGUE Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml | 1 + Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml | 1 + Documentation/devicetree/bindings/mtd/nand-controller.yaml | 1 + Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 3 +++ Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml | 1 + Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml | 1 + 6 files changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml index fc2f63860cc8..be32f087c529 100644 --- a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml +++ b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml @@ -42,6 +42,7 @@ properties: patternProperties: "^.*@[0-9a-fA-F]+$": + type: object properties: reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml index e5a411518be1..b5b3cf5b1ac2 100644 --- a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml +++ b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml @@ -55,6 +55,7 @@ patternProperties: "^pinctrl-[0-9]+$": true "^nand@[a-f0-9]+$": + type: object properties: reg: minimum: 0 diff --git a/Documentation/devicetree/bindings/mtd/nand-controller.yaml b/Documentation/devicetree/bindings/mtd/nand-controller.yaml index 199ba5ac2a06..d261b7096c69 100644 --- a/Documentation/devicetree/bindings/mtd/nand-controller.yaml +++ b/Documentation/devicetree/bindings/mtd/nand-controller.yaml @@ -40,6 +40,7 @@ properties: patternProperties: "^nand@[a-f0-9]$": + type: object properties: reg: description: diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index 06c4b66c3ee6..3ac5d2088e49 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -55,6 +55,7 @@ properties: patternProperties: '^gpio@[0-9a-f]*$': + type: object properties: gpio-controller: true '#gpio-cells': @@ -113,8 +114,10 @@ patternProperties: - st,bank-name '-[0-9]*$': + type: object patternProperties: '^pins': + type: object description: | A pinctrl node should contain at least one subnode representing the pinctrl group available on the machine. Each subnode will list the diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml index c374fd4923a6..6d1329c28170 100644 --- a/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml +++ b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml @@ -50,6 +50,7 @@ properties: patternProperties: "^.*@[0-9a-f]+": + type: object properties: reg: items: diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml index bda7a5befd8b..f36c46d236d7 100644 --- a/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml +++ b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml @@ -55,6 +55,7 @@ properties: patternProperties: "^.*@[0-9a-f]+": + type: object properties: reg: items: -- cgit From 7d9ef7f37d1f37981344d1a8c8578b67bdf4736a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 26 Jun 2019 17:57:59 -0600 Subject: dt-bindings: riscv: Limit cpus schema to only check RiscV 'cpu' nodes Matching on the 'cpus' node was a bad choice because the schema is incorrectly applied to non-RiscV cpus nodes. As we now have a common cpus schema which checks the general structure, it is also redundant to do so in the Risc-V CPU schema. The downside is one could conceivably mix different architecture's cpu nodes or have typos in the compatible string. The latter problem pretty much exists for every schema. Acked-by: Paul Walmsley Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/riscv/cpus.yaml | 143 +++++++++------------- 1 file changed, 61 insertions(+), 82 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index f97a4ecd7b91..c899111aa5e3 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -10,97 +10,76 @@ maintainers: - Paul Walmsley - Palmer Dabbelt -allOf: - - $ref: /schemas/cpus.yaml# - properties: - $nodename: - const: cpus - description: Container of cpu nodes - - '#address-cells': - const: 1 - description: | - A single unsigned 32-bit integer uniquely identifies each RISC-V - hart in a system. (See the "reg" node under the "cpu" node, - below). - - '#size-cells': - const: 0 + compatible: + items: + - enum: + - sifive,rocket0 + - sifive,e5 + - sifive,e51 + - sifive,u54-mc + - sifive,u54 + - sifive,u5 + - const: riscv + description: + Identifies that the hart uses the RISC-V instruction set + and identifies the type of the hart. + + mmu-type: + allOf: + - $ref: "/schemas/types.yaml#/definitions/string" + - enum: + - riscv,sv32 + - riscv,sv39 + - riscv,sv48 + description: + Identifies the MMU address translation mode used on this + hart. These values originate from the RISC-V Privileged + Specification document, available from + https://riscv.org/specifications/ + + riscv,isa: + allOf: + - $ref: "/schemas/types.yaml#/definitions/string" + - enum: + - rv64imac + - rv64imafdc + description: + Identifies the specific RISC-V instruction set architecture + supported by the hart. These are documented in the RISC-V + User-Level ISA document, available from + https://riscv.org/specifications/ + + timebase-frequency: + type: integer + minimum: 1 + description: + Specifies the clock frequency of the system timer in Hz. + This value is common to all harts on a single system image. + + interrupt-controller: + type: object + description: Describes the CPU's local interrupt controller -patternProperties: - '^cpu@[0-9a-f]+$': properties: - compatible: - type: array - items: - - enum: - - sifive,rocket0 - - sifive,e5 - - sifive,e51 - - sifive,u54-mc - - sifive,u54 - - sifive,u5 - - const: riscv - description: - Identifies that the hart uses the RISC-V instruction set - and identifies the type of the hart. - - mmu-type: - allOf: - - $ref: "/schemas/types.yaml#/definitions/string" - - enum: - - riscv,sv32 - - riscv,sv39 - - riscv,sv48 - description: - Identifies the MMU address translation mode used on this - hart. These values originate from the RISC-V Privileged - Specification document, available from - https://riscv.org/specifications/ - - riscv,isa: - allOf: - - $ref: "/schemas/types.yaml#/definitions/string" - - enum: - - rv64imac - - rv64imafdc - description: - Identifies the specific RISC-V instruction set architecture - supported by the hart. These are documented in the RISC-V - User-Level ISA document, available from - https://riscv.org/specifications/ + '#interrupt-cells': + const: 1 - timebase-frequency: - type: integer - minimum: 1 - description: - Specifies the clock frequency of the system timer in Hz. - This value is common to all harts on a single system image. - - interrupt-controller: - type: object - description: Describes the CPU's local interrupt controller - - properties: - '#interrupt-cells': - const: 1 - - compatible: - const: riscv,cpu-intc - - interrupt-controller: true + compatible: + const: riscv,cpu-intc - required: - - '#interrupt-cells' - - compatible - - interrupt-controller + interrupt-controller: true required: - - riscv,isa - - timebase-frequency + - '#interrupt-cells' + - compatible - interrupt-controller +required: + - riscv,isa + - timebase-frequency + - interrupt-controller + examples: - | // Example 1: SiFive Freedom U540G Development Kit -- cgit From ad21a4ce040cc41b4a085417169b558e86af56b7 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 15 Jul 2019 16:37:25 -0600 Subject: dt-bindings: pinctrl: aspeed: Fix 'compatible' schema errors The Aspeed pinctl schema have errors in the 'compatible' schema: Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml: \ properties:compatible:enum: ['aspeed', 'ast2400-pinctrl', 'aspeed', 'g4-pinctrl'] has non-unique elements Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml: \ properties:compatible:enum: ['aspeed', 'ast2500-pinctrl', 'aspeed', 'g5-pinctrl'] has non-unique elements Flow style sequences have to be quoted if the vales contain ','. Fix this by using the more common one line per entry formatting. Fixes: 0a617de16730 ("dt-bindings: pinctrl: aspeed: Convert AST2500 bindings to json-schema") Fixes: 07457937bb5c ("dt-bindings: pinctrl: aspeed: Convert AST2400 bindings to json-schema") Cc: Andrew Jeffery Cc: Linus Walleij Cc: Joel Stanley Cc: linux-aspeed@lists.ozlabs.org Cc: linux-gpio@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Acked-by: Andrew Jeffery Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml | 4 +++- Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml index 61a110a7db8a..125599a2dc5e 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml @@ -22,7 +22,9 @@ description: |+ properties: compatible: - enum: [ aspeed,ast2400-pinctrl, aspeed,g4-pinctrl ] + enum: + - aspeed,ast2400-pinctrl + - aspeed,g4-pinctrl patternProperties: '^.*$': diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml index cf561bd55128..a464cfa0cba3 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml @@ -22,7 +22,9 @@ description: |+ properties: compatible: - enum: [ aspeed,ast2500-pinctrl, aspeed,g5-pinctrl ] + enum: + - aspeed,ast2500-pinctrl + - aspeed,g5-pinctrl aspeed,external-nodes: minItems: 2 maxItems: 2 -- cgit From fcbe7e3cf86d665bf4924eecb13a5af10bcfa372 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 15 Jul 2019 16:48:41 -0600 Subject: dt-bindings: pinctrl: aspeed: Fix AST2500 example errors The schema examples are now validated against the schema itself. The AST2500 pinctrl schema has a couple of errors: Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.example.dt.yaml: \ example-0: $nodename:0: 'example-0' does not match '^(bus|soc|axi|ahb|apb)(@[0-9a-f]+)?$' Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.example.dt.yaml: \ pinctrl: aspeed,external-nodes: [[1, 2]] is too short Fixes: 0a617de16730 ("dt-bindings: pinctrl: aspeed: Convert AST2500 bindings to json-schema") Cc: Andrew Jeffery Cc: Linus Walleij Cc: Joel Stanley Cc: linux-aspeed@lists.ozlabs.org Cc: linux-gpio@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Acked-by: Andrew Jeffery Signed-off-by: Rob Herring --- .../devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml index a464cfa0cba3..3e6d85318577 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml @@ -76,9 +76,6 @@ required: examples: - | - compatible = "simple-bus"; - ranges; - apb { compatible = "simple-bus"; #address-cells = <1>; @@ -91,7 +88,7 @@ examples: pinctrl: pinctrl { compatible = "aspeed,g5-pinctrl"; - aspeed,external-nodes = <&gfx &lhc>; + aspeed,external-nodes = <&gfx>, <&lhc>; pinctrl_i2c3_default: i2c3_default { function = "I2C3"; -- cgit From fbbf2b6e9b74ffa79bef5e3da91200195045379e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 16 Jul 2019 14:13:29 -0600 Subject: dt-bindings: iio: avia-hx711: Fix avdd-supply typo in example Now that examples are validated against the DT schema, a typo in avia-hx711 example generates a warning: Documentation/devicetree/bindings/iio/adc/avia-hx711.example.dt.yaml: weight: 'avdd-supply' is a required property Fix the typo. Fixes: 5150ec3fe125 ("avia-hx711.yaml: transform DT binding to YAML") Cc: Andreas Klinger Cc: Jonathan Cameron Cc: linux-iio@vger.kernel.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml index 8a4100ceeaf2..d76ece97c76c 100644 --- a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml +++ b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml @@ -61,6 +61,6 @@ examples: compatible = "avia,hx711"; sck-gpios = <&gpio3 10 GPIO_ACTIVE_HIGH>; dout-gpios = <&gpio0 7 GPIO_ACTIVE_HIGH>; - avdd-suppy = <&avdd>; + avdd-supply = <&avdd>; clock-frequency = <100000>; }; -- cgit From 20051f5fdf6770f05d677e2f03b0a2bab6b0fc64 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 16 Jul 2019 14:21:56 -0600 Subject: dt-bindings: iio: ad7124: Fix dtc warnings in example With the conversion to DT schema, the examples are now compiled with dtc. The ad7124 binding example has the following warning: Documentation/devicetree/bindings/iio/adc/adi,ad7124.example.dts:19.11-21: \ Warning (reg_format): /example-0/adc@0:reg: property has invalid length (4 bytes) (#address-cells == 1, #size-cells == 1) There's a default #size-cells and #address-cells values of 1 for examples. For examples needing different values such as this one on a SPI bus, they need to provide a SPI bus parent node. Fixes: 26ae15e62d3c ("Convert AD7124 bindings documentation to YAML format.") Cc: Jonathan Cameron Cc: linux-iio@vger.kernel.org Signed-off-by: Rob Herring --- .../devicetree/bindings/iio/adc/adi,ad7124.yaml | 71 ++++++++++++---------- 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml index cf494a08b837..9692b7f719f5 100644 --- a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml @@ -114,42 +114,47 @@ patternProperties: examples: - | - adc@0 { - compatible = "adi,ad7124-4"; - reg = <0>; - spi-max-frequency = <5000000>; - interrupts = <25 2>; - interrupt-parent = <&gpio>; - refin1-supply = <&adc_vref>; - clocks = <&ad7124_mclk>; - clock-names = "mclk"; - + spi { #address-cells = <1>; #size-cells = <0>; - channel@0 { + adc@0 { + compatible = "adi,ad7124-4"; reg = <0>; - diff-channels = <0 1>; - adi,reference-select = <0>; - adi,buffered-positive; - }; - - channel@1 { - reg = <1>; - bipolar; - diff-channels = <2 3>; - adi,reference-select = <0>; - adi,buffered-positive; - adi,buffered-negative; - }; - - channel@2 { - reg = <2>; - diff-channels = <4 5>; - }; - - channel@3 { - reg = <3>; - diff-channels = <6 7>; + spi-max-frequency = <5000000>; + interrupts = <25 2>; + interrupt-parent = <&gpio>; + refin1-supply = <&adc_vref>; + clocks = <&ad7124_mclk>; + clock-names = "mclk"; + + #address-cells = <1>; + #size-cells = <0>; + + channel@0 { + reg = <0>; + diff-channels = <0 1>; + adi,reference-select = <0>; + adi,buffered-positive; + }; + + channel@1 { + reg = <1>; + bipolar; + diff-channels = <2 3>; + adi,reference-select = <0>; + adi,buffered-positive; + adi,buffered-negative; + }; + + channel@2 { + reg = <2>; + diff-channels = <4 5>; + }; + + channel@3 { + reg = <3>; + diff-channels = <6 7>; + }; }; }; -- cgit From e2297f7c3ab3b68dda2ac732b1767212019d3bdf Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 16 Jul 2019 15:34:40 -0600 Subject: dt-bindings: pinctrl: stm32: Fix missing 'clocks' property in examples Now that examples are validated against the DT schema, an error with required 'clocks' property missing is exposed: Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.example.dt.yaml: \ pinctrl@40020000: gpio@0: 'clocks' is a required property Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.example.dt.yaml: \ pinctrl@50020000: gpio@1000: 'clocks' is a required property Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.example.dt.yaml: \ pinctrl@50020000: gpio@2000: 'clocks' is a required property Add the missing 'clocks' properties to the examples to fix the errors. Fixes: 2c9239c125f0 ("dt-bindings: pinctrl: Convert stm32 pinctrl bindings to json-schema") Cc: Linus Walleij Cc: Maxime Coquelin Cc: linux-gpio@vger.kernel.org Cc: linux-stm32@st-md-mailman.stormreply.com Acked-by: Alexandre TORGUE Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml index 3ac5d2088e49..91d3e78b3395 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml @@ -197,6 +197,7 @@ required: examples: - | #include + #include //Example 1 pinctrl@40020000 { #address-cells = <1>; @@ -210,6 +211,7 @@ examples: #gpio-cells = <2>; reg = <0x0 0x400>; resets = <&reset_ahb1 0>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>; st,bank-name = "GPIOA"; }; }; @@ -227,6 +229,7 @@ examples: #gpio-cells = <2>; reg = <0x1000 0x400>; resets = <&reset_ahb1 0>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>; st,bank-name = "GPIOB"; gpio-ranges = <&pinctrl 0 0 16>; }; @@ -236,6 +239,7 @@ examples: #gpio-cells = <2>; reg = <0x2000 0x400>; resets = <&reset_ahb1 0>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>; st,bank-name = "GPIOC"; ngpios = <5>; gpio-ranges = <&pinctrl 0 16 3>, -- cgit From 1b03bc5c116383b8bc099e8d60978c379196a687 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Jul 2019 23:17:30 -0400 Subject: typo fix: it's d_make_root, not d_make_inode... Signed-off-by: Al Viro --- Documentation/filesystems/porting | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f6714f7e6bb5..d16e2ef7ae0b 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -436,7 +436,7 @@ for the inode. If d_make_root(inode) is passed a NULL inode it returns NULL and also requires no further error handling. Typical usage is: inode = foofs_new_inode(....); - s->s_root = d_make_inode(inode); + s->s_root = d_make_root(inode); if (!s->s_root) /* Nothing needed for the inode cleanup */ return -ENOMEM; -- cgit